Purpose

1- Characterize health Ethiopian oral microbiota
2- Compare healthy Ethiopian oral microbiota to other geographic populations
3- Conduct cross sectional analysis of case and controls in Ethiopian cohort
4- Compare ability to predict cancer status in external cohorts

Experimental Methods

Ethiopian data is V4 16S rRNA sequencing performed on NextSeq 2000 P1 600 cycle (not XLEAP). See https://github.com/BisanzLab/OHMC_Colaboratory for wet lab methods. DNA was extracted using the Zymobiomics Mag96 extractions and libraries prepaired using the OHMC amplicon library method.

For QIIME2 processing script see https://github.com/BisanzLab/OHMC_Colaboratory/blob/main/analysis_scripts/AmpliconSeq_q2.Rmd.


R Setup

Libraries

library(tidyverse)
library(readxl)
library(qiime2R)
library(vegan)
library(ape)
library(ggtree)
library(ALDEx2)
library(Biostrings)
library(cluster)
library(dada2)

theme_set(theme_q2r()) # this will ensure that figures are stored with appropriate fonts/aesthetics
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] dada2_1.36.0          Rcpp_1.1.0            cluster_2.1.8.1      
##  [4] Biostrings_2.76.0     GenomeInfoDb_1.44.3   XVector_0.48.0       
##  [7] IRanges_2.42.0        S4Vectors_0.46.0      BiocGenerics_0.54.1  
## [10] generics_0.1.4        ALDEx2_1.40.0         latticeExtra_0.6-31  
## [13] lattice_0.22-7        zCompositions_1.5.0-5 survival_3.8-3       
## [16] truncnorm_1.0-9       MASS_7.3-65           ggtree_3.99.1        
## [19] ape_5.8-1             vegan_2.7-2           permute_0.9-8        
## [22] qiime2R_0.99.6        readxl_1.4.5          lubridate_1.9.4      
## [25] forcats_1.0.1         stringr_1.5.2         dplyr_1.1.4          
## [28] purrr_1.1.0           readr_2.1.5           tidyr_1.3.1          
## [31] tibble_3.3.0          ggplot2_4.0.1         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3          rstudioapi_0.17.1          
##   [3] jsonlite_2.0.0              magrittr_2.0.4             
##   [5] farver_2.1.2                rmarkdown_2.30             
##   [7] fs_1.6.6                    vctrs_0.6.5                
##   [9] multtest_2.64.0             Rsamtools_2.24.1           
##  [11] base64enc_0.1-3             htmltools_0.5.8.1          
##  [13] S4Arrays_1.8.1              cellranger_1.1.0           
##  [15] Rhdf5lib_1.30.0             SparseArray_1.8.1          
##  [17] Formula_1.2-5               rhdf5_2.52.1               
##  [19] gridGraphics_0.5-1          sass_0.4.10                
##  [21] bslib_0.9.0                 htmlwidgets_1.6.4          
##  [23] plyr_1.8.9                  cachem_1.1.0               
##  [25] GenomicAlignments_1.44.0    igraph_2.2.0               
##  [27] lifecycle_1.0.4             iterators_1.0.14           
##  [29] pkgconfig_2.0.3             Matrix_1.7-4               
##  [31] R6_2.6.1                    fastmap_1.2.0              
##  [33] GenomeInfoDbData_1.2.14     MatrixGenerics_1.20.0      
##  [35] digest_0.6.37               aplot_0.2.9                
##  [37] colorspace_2.1-2            ShortRead_1.66.0           
##  [39] patchwork_1.3.2             Hmisc_5.2-4                
##  [41] GenomicRanges_1.60.0        hwriter_1.3.2.1            
##  [43] timechange_0.3.0            httr_1.4.7                 
##  [45] abind_1.4-8                 mgcv_1.9-3                 
##  [47] compiler_4.5.0              fontquiver_0.2.1           
##  [49] withr_3.0.2                 htmlTable_2.4.3            
##  [51] S7_0.2.0                    backports_1.5.0            
##  [53] BiocParallel_1.42.2         DelayedArray_0.34.1        
##  [55] rappdirs_0.3.3              biomformat_1.36.0          
##  [57] tools_4.5.0                 foreign_0.8-90             
##  [59] nnet_7.3-20                 quadprog_1.5-8             
##  [61] glue_1.8.0                  nlme_3.1-168               
##  [63] rhdf5filters_1.20.0         grid_4.5.0                 
##  [65] checkmate_2.3.3             reshape2_1.4.4             
##  [67] ade4_1.7-23                 gtable_0.3.6               
##  [69] tzdb_0.5.0                  data.table_1.17.8          
##  [71] hms_1.1.4                   foreach_1.5.2              
##  [73] pillar_1.11.1               yulab.utils_0.2.1          
##  [75] splines_4.5.0               treeio_1.32.0              
##  [77] deldir_2.0-4                directlabels_2025.6.24     
##  [79] tidyselect_1.2.1            fontLiberation_0.1.0       
##  [81] knitr_1.50                  fontBitstreamVera_0.1.1    
##  [83] gridExtra_2.3               phyloseq_1.52.0            
##  [85] SummarizedExperiment_1.38.1 xfun_0.53                  
##  [87] Biobase_2.68.0              matrixStats_1.5.0          
##  [89] DT_0.34.0                   stringi_1.8.7              
##  [91] UCSC.utils_1.4.0            lazyeval_0.2.2             
##  [93] ggfun_0.2.0                 yaml_2.3.10                
##  [95] evaluate_1.0.5              codetools_0.2-20           
##  [97] interp_1.1-6                gdtools_0.4.4              
##  [99] ggplotify_0.1.3             cli_3.6.5                  
## [101] RcppParallel_5.1.11-1       rpart_4.1.24               
## [103] systemfonts_1.3.1           jquerylib_0.1.4            
## [105] zigg_0.0.2                  png_0.1-8                  
## [107] Rfast_2.1.5.2               parallel_4.5.0             
## [109] jpeg_0.1-11                 bitops_1.0-9               
## [111] pwalign_1.4.0               tidytree_0.4.6             
## [113] ggiraph_0.9.2               scales_1.4.0               
## [115] crayon_1.5.3                rlang_1.1.6

Data Import

Data was derived using v2.1 of script found here: https://github.com/BisanzLab/OHMC_Colaboratory/blob/main/analysis_scripts/AmpliconSeq_q2.Rmd.

metadata<-read_excel("inputs/EOSC_metadata_2026Feb11.xlsx")
asv_table<-readRDS("inputs/asv_table.RDS")
asv_taxonomy<-readRDS("inputs/asv_taxonomy.RDS")
asv_tree<-readRDS("inputs/asv_tree.RDS")

Copies of raw data

interactive_table(metadata)
xfun::embed_file("inputs/asv_table.RDS")
Download asv_table.RDS
xfun::embed_file("inputs/asv_taxonomy.RDS")
Download asv_taxonomy.RDS
xfun::embed_file("inputs/asv_tree.RDS")
Download asv_tree.RDS

QC

First check negative and positive control read counts.

asv_table[,subset(metadata, grepl("Control", SampleType))$SampleID] %>% colSums()
##  EOSC_ExtCon_Plate1B2 EOSC_ExtCon_Plate1G10  EOSC_ExtCon_Plate2D3 
##                   601                   629                   856 
## EOSC_ExtCon_Plate2G10  EOSC_ExtCon_Plate3C2  EOSC_ExtCon_Plate3D7 
##                   191                  1137                   641 
##  EOSC_ExtCon_Plate4B2  EOSC_ExtCon_Plate4E2  EOSC_ExtCon_Plate4F1 
##                   578                  1592                   560 
##  EOSC_ExtCon_Plate4F2    EOSC_ZymoCom_P2B10     EOSC_ZymoCom_P3C9 
##                   922                172785                 34691 
##     EOSC_ZymoCom_P4C1 EOSC_ZymoCom_Plate1D7 
##                126466                202605

There are <1592 reads in negative controls vs >34,691 in positive zymo controls. Will check what is found in negative controls below.

neg_contaminants<- asv_table[,subset(metadata, SampleType=="Negative Control")$SampleID] %>%
  filter_features(1,1) %>% rownames()
  
  
asv_table[,subset(metadata, SampleType=="Negative Control")$SampleID] %>%
  filter_features(1,1) %>%
  as.data.frame() %>%
  rownames_to_column("ASV") %>%
  left_join(asv_taxonomy %>% dplyr::select(-Sequence)) %>%
  interactive_table()

All reads in extraction controls are tell-tale kit-ome of our current lot of reagents. To be safe, will use a 10,000 read threshold for successful sequencing.

Will also look for consistent contaminants.

asv_table[,subset(metadata, SampleType=="Negative Control")$SampleID] %>%
  filter_features(1,1) %>%
  as.data.frame() %>%
  rownames_to_column("ASV") %>%
  mutate_if(is.numeric, function(x) if_else(x>0,1,0)) %>%
  UpSetR::upset(nsets = 10)

The two conserved contaminants appear to be the same E. coli and Bordetella previously observed in this reagent batch.

Next will check composition of positive controls which are zymo community standard.

pos_contaminants<-
asv_table[,subset(metadata, SampleType=="Positive Control")$SampleID] %>%
  filter_features(1,1) %>%
  as.data.frame() %>%
  rownames_to_column("ASV") %>%
  left_join(asv_taxonomy %>% dplyr::select(-Sequence)) %>%
  filter(!Genus %in% c("Bacillus","Enterococcus","Escherichia-Shigella","Limosilactobacillus","Pseudomonas","Salmonella","Staphylococcus","Listeria"))

colSums(pos_contaminants[,2:5])
##    EOSC_ZymoCom_P2B10     EOSC_ZymoCom_P3C9     EOSC_ZymoCom_P4C1 
##                    15                     0                    11 
## EOSC_ZymoCom_Plate1D7 
##                    43
colSums(pos_contaminants[,2:5]) %>% mean()
## [1] 17.25
colSums(pos_contaminants[,2:5]) %>% sd()
## [1] 18.30073

i.e. in positive controls these putative contaminants make up less than 43 reads per sample.

asv_table[,subset(metadata, SampleType=="Positive Control")$SampleID] %>%
  filter_features(1,1) %>%
  as.data.frame() %>%
  rownames_to_column("ASV") %>%
  left_join(asv_taxonomy %>% dplyr::select(-Sequence)) %>%
  interactive_table()

Single/double digit contaminants that are mix of background, and either seq errors or trace cross-contamination in extractions.

pos_con<-
asv_table[,subset(metadata, SampleType=="Positive Control")$SampleID] %>%
  filter_features(1,1)
  summarize_taxa(pos_con, asv_taxonomy %>% column_to_rownames("ASV"))$Species %>%
  taxa_barplot()

ggsave("figures/zymo_standard.pdf", height=4, width=8, useDingbats=F)
rm(pos_con)

Data Cleanup

Remove controls and filter for low read depth samples.

asv_table_unfiltered<-asv_table # save for later
metadata_unfiltered<-metadata
asv_table<-asv_table[,colSums(asv_table)>10000]
asv_table<-asv_table[,subset(metadata, SampleID %in% colnames(asv_table) & SampleType=="Saliva")$SampleID] %>% filter_features(1,1)
metadata<-metadata %>% filter(SampleID %in% colnames(asv_table)) %>% mutate(Group=factor(Group, levels=c("Healthy","Diseased")))
asv_table<-asv_table[,metadata$SampleID]
asv_taxonomy<-asv_taxonomy %>% filter(ASV %in% rownames(asv_table))
asv_tree<-keep.tip(asv_tree, rownames(asv_table))

Double check tree for odd topology / problematic features. Will root on Methanobrevibacteria for figures c6cb8ffae3081efef011a521c8039722.

root(asv_tree, outgroup="c6cb8ffae3081efef011a521c8039722") %>%
ggtree() %<+% asv_taxonomy +
  geom_tippoint(aes(color=Phylum))

ggsave("figures/tree.pdf", height=8, width=11.5, useDingbats=F)

Summary of sample breakdown between healthy and controls in final dataset:

metadata %>% count(Group)
## # A tibble: 2 × 2
##   Group        n
##   <fct>    <int>
## 1 Healthy    108
## 2 Diseased   103

Table S1. Potential contaminants

contaminant_reads<-
asv_table_unfiltered[unique(c(pos_contaminants$ASV, neg_contaminants)),] %>%
  as.data.frame() %>%
  rownames_to_column("ASV") %>%
  pivot_longer(!ASV, names_to = "SampleID", values_to = "Abundance") %>%
  left_join(metadata_unfiltered) %>%
  filter(SampleType!="Exclude") %>%
  filter(SampleType %in% c("Negative Control","Positive Control") | SampleID %in% metadata$SampleID) %>% #resync tables
  mutate(SampleType=case_when(Group=="Diseased"~"Case", Group=="Healthy"~"Control", TRUE~SampleType)) %>%
  group_by(ASV, SampleType, Group) %>%
  summarize_at("Abundance", lst(mean,median,min,max,sd, length)) %>%
  mutate(stat=paste0(median," [",min,"-",max,"]")) %>%
  arrange(desc(median)) %>%
  dplyr::select(ASV, SampleType, stat) %>%
  pivot_wider(names_from = "SampleType", values_from = "stat")

contaminant_percent<-asv_table_unfiltered %>% make_percent()
contaminant_percent<-
  contaminant_percent[unique(c(pos_contaminants$ASV, neg_contaminants)),] %>%
  as.data.frame() %>%
  rownames_to_column("ASV") %>%
  pivot_longer(!ASV, names_to = "SampleID", values_to = "Abundance") %>%
  left_join(metadata_unfiltered) %>%
  filter(SampleType!="Exclude") %>%
  filter(SampleType %in% c("Negative Control","Positive Control") | SampleID %in% metadata$SampleID) %>% #resync tables
  mutate(SampleType=case_when(Group=="Diseased"~"Case", Group=="Healthy"~"Control", TRUE~SampleType)) %>%
  group_by(ASV, SampleType, Group) %>%
  summarize_at("Abundance", lst(mean,median,min,max,sd, length)) %>%
  mutate_if(is.numeric, function(x) signif(x, digits=3)) %>%
  mutate(stat=paste0(median," [",min,"-",max,"]")) %>%
  arrange(desc(median)) %>%
  dplyr::select(ASV, SampleType, stat) %>%
  pivot_wider(names_from = "SampleType", values_from = "stat")


contaminant_reads %>%
  full_join(contaminant_percent, by="ASV", suffix=c("_reads","_percent")) %>%
  left_join(asv_taxonomy %>% dplyr::select(ASV, Kingdom, Family, Genus, Species)) %>%
  interactive_table()

Table 1: Demographics

metadata %>% group_by(Group) %>% summarize_at("Age", lst(mean,sd,median,min,max))
## # A tibble: 2 × 6
##   Group     mean    sd median   min   max
##   <fct>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1 Healthy   39.0  7.10     38    28    62
## 2 Diseased  51.5 13.8      50    18   105
wilcox.test(Age~Group, metadata)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Age by Group
## W = 2182, p-value = 2.276e-14
## alternative hypothesis: true location shift is not equal to 0
metadata %>% group_by(Sex, Group) %>% count()
## # A tibble: 4 × 3
## # Groups:   Sex, Group [4]
##   Sex   Group        n
##   <chr> <fct>    <int>
## 1 F     Healthy     80
## 2 F     Diseased    61
## 3 M     Healthy     28
## 4 M     Diseased    42
metadata %>% group_by(Sex, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n) %>% as.data.frame() %>% column_to_rownames("Sex") %>% fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.02809
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.056013 3.682912
## sample estimates:
## odds ratio 
##   1.960841
metadata %>% group_by(Smoking, Group) %>% count()
## # A tibble: 3 × 3
## # Groups:   Smoking, Group [3]
##   Smoking Group        n
##   <chr>   <fct>    <int>
## 1 No      Healthy    108
## 2 No      Diseased   100
## 3 Yes     Diseased     3
metadata %>% group_by(Smoking, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Smoking") %>% fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.1146
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4359386       Inf
## sample estimates:
## odds ratio 
##        Inf
metadata %>% group_by(`Chewing tobacco`, Group) %>% count()
## # A tibble: 3 × 3
## # Groups:   Chewing tobacco, Group [3]
##   `Chewing tobacco` Group        n
##   <chr>             <fct>    <int>
## 1 No                Healthy    108
## 2 No                Diseased   100
## 3 Yes               Diseased     3
metadata %>% group_by(`Chewing tobacco`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Chewing tobacco") %>% fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.1146
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4359386       Inf
## sample estimates:
## odds ratio 
##        Inf
metadata %>% group_by(`Alcohol Drink`, Group) %>% count()
## # A tibble: 4 × 3
## # Groups:   Alcohol Drink, Group [4]
##   `Alcohol Drink` Group        n
##   <chr>           <fct>    <int>
## 1 No              Healthy     88
## 2 No              Diseased    97
## 3 Yes             Healthy     20
## 4 Yes             Diseased     6
metadata %>% group_by(`Alcohol Drink`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Alcohol Drink") %>% fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.005992
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.08597284 0.74784904
## sample estimates:
## odds ratio 
##  0.2737422
metadata %>% group_by(`Khat use`, Group) %>% count()
## # A tibble: 4 × 3
## # Groups:   Khat use, Group [4]
##   `Khat use` Group        n
##   <chr>      <fct>    <int>
## 1 No         Healthy    104
## 2 No         Diseased    98
## 3 Yes        Healthy      4
## 4 Yes        Diseased     5
metadata %>% group_by(`Khat use`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Khat use") %>% fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.7435
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2763203 6.8773063
## sample estimates:
## odds ratio 
##   1.324753
metadata %>% group_by(`Coffee drinking`, Group) %>% count()
## # A tibble: 3 × 3
## # Groups:   Coffee drinking, Group [3]
##   `Coffee drinking` Group        n
##   <chr>             <fct>    <int>
## 1 No                Healthy      9
## 2 Yes               Healthy     99
## 3 Yes               Diseased   103
metadata %>% group_by(`Coffee drinking`, Group) %>% count() %>% pivot_wider(names_from = Group, values_from = n, values_fill = 0) %>% as.data.frame() %>% column_to_rownames("Coffee drinking") %>% fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 0.003342
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.980536      Inf
## sample estimates:
## odds ratio 
##        Inf

Potential batch effects and impact of recruitment pairing.

dists<- asv_table %>% filter_features(1,1) %>% make_clr() %>% t() %>% dist(., method="euclidean")

mat <- as.matrix(dists)
rownames(mat) <- colnames(mat)

# Extract unique pairs (no diagonal, no duplicates)
idx <- which(upper.tri(mat), arr.ind = TRUE)

df <- data.frame(
  sample1  = rownames(mat)[idx[,1]],
  sample2  = colnames(mat)[idx[,2]],
  distance = mat[idx]
)

df<-
df %>%
  as_tibble() %>%
  arrange(sample1) %>%
  left_join(metadata %>% dplyr::select(sample1=SampleID, Pair1=Pair)) %>%
  left_join(metadata %>% dplyr::select(sample2=SampleID, Pair2=Pair)) %>%
  mutate(PairType=if_else(Pair1==Pair2 & Pair1!="Unpaired", "Intra-Pair", "Inter-Pair"))
  #filter(PairType=="Intra-Pair") %>%

  
df %>%
  ggplot(aes(x=PairType, y=distance)) +
  geom_boxplot()

df %>% t.test(distance~PairType, data=.)
## 
##  Welch Two Sample t-test
## 
## data:  distance by PairType
## t = -0.85565, df = 64.386, p-value = 0.3954
## alternative hypothesis: true difference in means between group Inter-Pair and group Intra-Pair is not equal to 0
## 95 percent confidence interval:
##  -6.754442  2.703169
## sample estimates:
## mean in group Inter-Pair mean in group Intra-Pair 
##                 126.0124                 128.0381
rm(dists, df, idx, mat)

Samples belonging to cases and the individual who accompanied them to the clinic are not significantly more similar than un-associated individuals.

To examine batch effects we will only examine extraction plate as the remainder of the processing was performed on a single 384 well plate/sequencing run.

metadata %>% group_by(Group, ExtractionPlate) %>% count() %>% pivot_wider(names_from = "Group", values_from = "n") # no discrepancies by extraction plate.
## # A tibble: 4 × 3
## # Groups:   ExtractionPlate [4]
##   ExtractionPlate Healthy Diseased
##   <chr>             <int>    <int>
## 1 Plate1               38       38
## 2 Plate2               31       26
## 3 Plate3               36       37
## 4 Plate4                3        2

There is no evidence that controls vs cancer participants were unevenly distributed across plates.

batchalpha<-
  asv_table %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
  rownames_to_column("SampleID") %>%
  dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
  full_join(
    asv_table %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
  ) %>%
  inner_join(metadata) %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity")

batchalpha %>% group_by(Metric) %>% do(aov(Diversity~ExtractionPlate, data=.) %>% broom::tidy()) %>% knitr::kable()
Metric term df sumsq meansq statistic p.value
ASV_Richness ExtractionPlate 3 1.614791e+04 5.382638e+03 0.5204398 0.6686741
ASV_Richness Residuals 207 2.140893e+06 1.034248e+04 NA NA
PhylogeneticDiversity ExtractionPlate 3 1.226250e+01 4.087502e+00 0.2525145 0.8594882
PhylogeneticDiversity Residuals 207 3.350749e+03 1.618719e+01 NA NA
ShannonDiversity ExtractionPlate 3 2.089590e-02 6.965300e-03 0.0299437 0.9930066
ShannonDiversity Residuals 207 4.815105e+01 2.326138e-01 NA NA
batchalpha %>% ggplot(aes(x=ExtractionPlate, y=Diversity, fill=ExtractionPlate)) + geom_boxplot() + facet_wrap(~Metric, scales="free") + theme(legend.position = "none")

ggsave("manuscript_figures/batch_alpha.pdf", height=3, width=6, useDingbats=F)

No evidence of impact on diversity of communities. Will check community structure.

batchdist<-
summarize_taxa(asv_table, asv_taxonomy %>% column_to_rownames("ASV"))$Genus %>%
  make_clr() %>%
  t() %>%
  dist(., method="euclidean")

batchpc<-ape::pcoa(batchdist)

batchpc$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata) %>%
    ggplot(aes(x=Axis.1, y=Axis.2, fill=ExtractionPlate)) +
    geom_point(shape=21, color="grey50") +
    theme(legend.position="bottom") +
    xlab(paste0("PC1: ",round(batchpc$values$Relative_eig[1]*100,2),"%")) +
    ylab(paste0("PC2: ",round(batchpc$values$Relative_eig[2]*100,2),"%"))

adonis2(batchdist~metadata[match(labels(batchdist), metadata$SampleID),]$ExtractionPlate) %>% print()
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = batchdist ~ metadata[match(labels(batchdist), metadata$SampleID), ]$ExtractionPlate)
##           Df SumOfSqs      R2      F Pr(>F)
## Model      3     4526 0.01591 1.1154  0.207
## Residual 207   279989 0.98409              
## Total    210   284515 1.00000
ggsave("manuscript_figures/batch_clr_euc.pdf", height=2.3, width=2, useDingbats=F)
rm(batchalpha, batchpc, batchdist)

Figure 1: Description of Healthy Oral Microbiome in Cohort

fig1_meta<-metadata %>% filter(Group=="Healthy")
fig1_asvs<-asv_table[,fig1_meta$SampleID] %>% filter_features(1,1)

Figure 1AB. Alpha Diversity

fig1_adiversity<-
  fig1_asvs %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
  rownames_to_column("SampleID") %>%
  dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
  full_join(
    fig1_asvs %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
  ) %>%
  left_join(metadata)

interactive_table(fig1_adiversity)
fig1_adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric) %>%
  summarize_at("Diversity", lst(mean,median,sd,min,max)) %>%
  interactive_table()
fig1_adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  ggplot(aes(x=Diversity)) +
  geom_freqpoly(bins=20) +
  facet_wrap(~Metric, scales="free") +
  ylab("# Healhy Participants") +
  xlab("Alpha Diversity")

ggsave("manuscript_figures/healthy_alpha_diversity.pdf", height=2, width=5, useDingbats=F)

Figure 1C. Taxonomic Description

fams<-summarize_taxa(fig1_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Family %>% make_percent()

sample_order<-dist(t(fams)) %>%  hclust(method="average") # average is upgma %>% plot()
sample_order<-sample_order$labels[sample_order$order]

q2r_palette <- c("blue4", "olivedrab", "firebrick", "gold", 
        "darkorchid", "steelblue2", "chartreuse1", "aquamarine", 
        "yellow3", "coral", "grey")

toplot<-rowMeans(fams) %>% sort(.,decreasing=TRUE) %>% names() %>% .[1:10]

fams %>%
  as.data.frame() %>%
  rownames_to_column("Taxon") %>%
  pivot_longer(!Taxon) %>%
  mutate(TaxonGroup=if_else(Taxon %in% toplot, Taxon, "Other")) %>%
  group_by(TaxonGroup, name) %>%
  summarize(value=sum(value)) %>%
  ungroup() %>%
  mutate(name=factor(name, levels=sample_order)) %>%
  mutate(TaxonGroup=factor(TaxonGroup, levels=rev(c(toplot,"Other")))) %>%
  ggplot(aes(x=name, y=value, fill=TaxonGroup)) +
  geom_bar(stat="identity") +
  scale_fill_manual(values=rev(q2r_palette), name="Family") +
  coord_cartesian(ylim=c(0,100), expand = F) +
  theme(axis.ticks.x = element_blank(), axis.text.x=element_blank()) +
  xlab("Participant") +
  ylab("% Abundance") +
  theme(legend.text = element_text(size=3))

ggsave("manuscript_figures/healthy_boxplot.pdf", height= 2, width=5)

rm(fams, sample_order, toplot)

Alternate reviewer figure showing “average” healthy composition. Will use the compositional average/center.

avgsamp<-
summarize_taxa(fig1_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Family %>%
  make_clr() %>%
  apply(., 1, mean) %>%
  data.frame(CLR=.) %>%
  rownames_to_column("Family") %>%
  mutate(Percent=2^CLR/sum(2^CLR)*100) %>%
  arrange(desc(Percent)) %>%
  top_n(10, Percent)

avgsamp %>%
  bind_rows(
      tibble(Family="Other", CLR=NA, Percent=100-sum(avgsamp$Percent))
  ) %>%
  mutate(Family=factor(Family, levels=Family) )%>%
  ggplot(aes(x=1, y=Percent, fill=Family)) +
  geom_col(color="black") +
  coord_polar(theta="y") +
  theme_void() +
  scale_fill_manual(values=q2r_palette)

ggsave("manuscript_figures/healthy_pie.pdf", height= 2, width=10)
  
rm(avgsamp)

Figure 1D. Clusters

In healthy groups check for sub structure/community state types using PAM with validation by gap statistic. Using genus abundances here to reduce dimensionality and for later compatibility.

fig1_asvs<-summarize_taxa(fig1_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Genus

dists<-list()
  dists$`CLR Euclidean`<-fig1_asvs %>% make_clr() %>% t() %>% dist(., method="euclidean")

pcos<-lapply(dists, ape::pcoa)
Metric="CLR Euclidean"
do_pam<-function(distmat, Nclusters){list(cluster = pam(distmat,Nclusters, cluster.only=TRUE))}
gapstats<-clusGap(pcos[[Metric]]$vectors, FUN=do_pam, K.max = 10, B=100) #maximum of 13 clusters (ie 2 per cluster)

gapstats<-
  gapstats$Tab %>%
  as.data.frame() %>%
  mutate(Nclusters=1:nrow(.)) %>%
  dplyr::select(Nclusters, GapStatistic=gap, SE=SE.sim)

ggplot(gapstats, aes(x=Nclusters, y=GapStatistic, ymin=GapStatistic-SE, ymax=GapStatistic+SE)) +
  geom_errorbar(width=0) +
  geom_point() +
  scale_x_continuous(breaks=(1:10))

ggsave("manuscript_figures/health_gapstat.pdf", height=2, width=2)

clusters<-pam(dists[[Metric]], k=2, diss=TRUE, cluster.only = TRUE)
clustered_metadata<-tibble(SampleID=names(clusters), Cluster=paste0("Cluster_", clusters))
rm(clusters)

Figure 1E. PCoA of Clusters

lapply(names(pcos), function(x){
    pcos[[x]]$values %>%
      as.data.frame() %>%
      mutate(Metric=x) %>%
      mutate(Axis=1:nrow(.))
  }) %>%
  bind_rows() %>%
  filter(Axis %in% 1:3) %>%
  dplyr::select(Metric, Axis, everything()) %>%
  interactive_table()
pcos[[Metric]]$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata) %>%
    left_join(clustered_metadata) %>%
    ggplot(aes(x=Axis.1, y=Axis.2, fill=Cluster)) +
    geom_point(shape=21, color="grey50") +
    theme(legend.position="bottom") +
    xlab(paste0("PC1: ",round(pcos[[Metric]]$values$Relative_eig[1]*100,2),"%")) +
    ylab(paste0("PC2: ",round(pcos[[Metric]]$values$Relative_eig[2]*100,2),"%")) +
    scale_fill_manual(values=c("darkorange","purple4"))

ggsave("manuscript_figures/healthy_clreuc.pdf", height=2.3, width=2, useDingbats=F)


vegan::adonis2(dists[[Metric]]~Cluster, data=clustered_metadata[match(labels(dists[[Metric]]), clustered_metadata$SampleID),])
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = dists[[Metric]] ~ Cluster, data = clustered_metadata[match(labels(dists[[Metric]]), clustered_metadata$SampleID), ])
##           Df SumOfSqs      R2      F Pr(>F)    
## Model      1    24411 0.18321 23.776  0.001 ***
## Residual 106   108828 0.81679                  
## Total    107   133238 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
clustered_metadata %>% count(Cluster)
## # A tibble: 2 × 2
##   Cluster       n
##   <chr>     <int>
## 1 Cluster_1    64
## 2 Cluster_2    44
rm(Metric, gapstats)

Figure 1F. Differential diversity between Clusters.

fig1_adiversity %>%
  left_join(clustered_metadata) %>%
  ggplot(aes(x=Cluster, y=ShannonDiversity, fill=Cluster)) +
  geom_boxplot() +
  scale_fill_manual(values=c("darkorange","purple4")) +
  theme(legend.position="none") +
  coord_cartesian(ylim=c(2, 6))

ggsave("manuscript_figures/healthy_clusters_diversity.pdf", height=2, width=1, useDingbats=F)

fig1_adiversity %>%
  left_join(clustered_metadata) %>%
  t.test(ShannonDiversity~Cluster, data=.)
## 
##  Welch Two Sample t-test
## 
## data:  ShannonDiversity by Cluster
## t = 7.1238, df = 69.53, p-value = 7.657e-10
## alternative hypothesis: true difference in means between group Cluster_1 and group Cluster_2 is not equal to 0
## 95 percent confidence interval:
##  0.4302591 0.7649074
## sample estimates:
## mean in group Cluster_1 mean in group Cluster_2 
##                4.357102                3.759519

Figure 1G. Differential abundance between clusters

results<-
fig1_asvs[,clustered_metadata$SampleID] %>%
  filter_features(5,5) %>%
  aldex(reads=., clustered_metadata$Cluster, include.sample.summary = TRUE)

cleaned_results<-
  results %>% filter(we.eBH<0.1 & abs(diff.btw)>1) %>%
  arrange(diff.btw) %>%
  rownames_to_column("ASV") %>%
  dplyr::select(ASV, Cluster1_CLR=rab.win.Cluster_1, Cluster2=rab.win.Cluster_2, log2FC=diff.btw, Pvalue=we.ep, FDR=we.eBH) %>%
  arrange(log2FC)

interactive_table(cleaned_results)
write_tsv(cleaned_results, "manuscript_figures/healthy_clusters_aldex.tsv")

results %>%
  rownames_to_column("Taxon") %>%
  mutate(Label=if_else(Taxon %in% cleaned_results$ASV, Taxon,"")) %>%
  mutate(fc=case_when(
    Label!="" & diff.btw>0 ~ "Cluster 2",
    Label!="" & diff.btw<0 ~ "Cluster 1",
    TRUE ~ "ns"
         )) %>%
  mutate(Label=gsub("..+\\;","", Label)) %>%
  ggplot(aes(y=-log10(we.ep), x=diff.btw, color=fc)) +
  geom_point(shape=16, alpha=0.6) +
  ggrepel::geom_text_repel(aes(label=Label), size=1.8) +
  xlab("log2(fold change) Cluster 2 vs Cluster 1") +
  ylab("-log10(P-value)") +
  scale_color_manual(values=c("darkorange","purple4", "grey50")) +
  theme(legend.position="none")

ggsave("manuscript_figures/healthy_volcano_clusters.pdf", height=2, width=3.5, useDingbats=F)
rm(results)

Figure 1H. Differential absolute abundances between clusters.

Derived using the V6 total dual-dye 16S qPCR and normalized based on a 1:1 dilution with preservative and 5mL of saliva.

clustered_qpcr<-
read_csv("EOSC_16S_qPCR.csv") %>%
  dplyr::select(SampleID, Cq, `16S copies/mL`, `log10(16S copies/mL)`) %>%
  filter(SampleID %in% clustered_metadata$SampleID) %>%
  left_join(clustered_metadata)

interactive_table(clustered_qpcr)
clustered_qpcr %>%
  group_by(Cluster) %>%
  summarize_at("log10(16S copies/mL)", lst(mean,median,sd,min,max, length)) %>% knitr::kable()
Cluster mean median sd min max length
Cluster_1 7.532746 7.700356 0.9862739 3.166367 8.739529 64
Cluster_2 6.733028 6.679563 0.6072923 5.703809 7.959374 44
clustered_qpcr %>%
  ggplot(aes(x=Cluster, y=`log10(16S copies/mL)`, fill=Cluster)) +
  geom_boxplot() +
  scale_fill_manual(values=c("darkorange","purple4")) +
  theme(legend.position="none") +
  coord_cartesian(ylim=c(3,10))

ggsave("manuscript_figures/clustered_qPCR.pdf", height=2, width=1, useDingbats=F)

t.test(`log10(16S copies/mL)`~Cluster, data=clustered_qpcr) %>% broom::tidy()
## # A tibble: 1 × 10
##   estimate estimate1 estimate2 statistic    p.value parameter conf.low conf.high
##      <dbl>     <dbl>     <dbl>     <dbl>      <dbl>     <dbl>    <dbl>     <dbl>
## 1    0.800      7.53      6.73      5.21    9.58e-7      105.    0.495      1.10
## # ℹ 2 more variables: method <chr>, alternative <chr>

Figure S1. PICRUST

tm<-asv_table %>% subsample_table()
tm %>% biomformat::make_biom() %>% biomformat::write_biom("asv_table.biom")
ta<-asv_taxonomy %>% filter(ASV %in% rownames(tm))
fa<-DNAStringSet(ta$Sequence)
names(fa)<-ta$ASV
writeXStringSet(fa, "asv_seqs.fa")

# see https://github.com/picrust/picrust2/wiki/Full-pipeline-script
conda activate picrust2 # picrust 2.4.1
picrust2_pipeline.py -s asv_seqs.fa -i asv_table.biom -o picrust2_out_strat -p 24 --stratified
add_descriptions.py -i picrust2_out_strat/pathways_out/path_abun_unstrat.tsv.gz -m METACYC -o picrust2_out_strat/pathways_w_desc.tsv
picrust<-read_tsv("picrust2_out_strat/pathways_w_desc.tsv")[,c("pathway","description", colnames(fig1_asvs))]
picrust<-picrust[rowSums(picrust[,3:ncol(picrust)])>0,]

picrust<-
picrust %>%
  pivot_longer(!c(pathway, description), names_to = "SampleID", values_to = "Abundance") %>%
  group_by(SampleID) %>%
  mutate(Abundance=Abundance/sum(Abundance)) %>%
  group_by(pathway, description) %>%
  mutate(Abundance=log2(Abundance+(min_nonzero(Abundance)*(2/3)))) %>%
  ungroup() %>%
  left_join(clustered_metadata) %>%
  mutate(Cluster=factor(Cluster, levels=c("Cluster_1","Cluster_2")))
  


picrust_results<-
  picrust %>%
  group_by(pathway, description) %>%
  do(
    t.test(Abundance~Cluster, data=.) %>%
      broom::tidy()
  ) %>%
  ungroup()

picrust_results<-
picrust_results %>%
  dplyr::rename(log2FC=estimate, Cluster1=estimate1, Cluster2=estimate2) %>%
  mutate(FDR=p.adjust(p.value, method="BH"))

interactive_table(picrust_results)
picrust_results %>%
  filter(FDR<0.1 & abs(log2FC)>1) %>%
  arrange(desc(log2FC)) %>%
  mutate(dir=if_else(Cluster1>Cluster2, "Cluster1", "Cluster2")) %>%
  mutate(pathway=paste0(pathway, ": ", description)) %>%
  mutate(pathway=factor(pathway, levels=rev(pathway))) %>%
  ggplot(aes(x=pathway, y=log2FC, ymin=conf.low, ymax=conf.high, color=dir)) +
  geom_hline(yintercept = 0, linetype="dashed", color="grey50") +
  geom_errorbar(width=0) +
  geom_point(size=1) +
  coord_flip() +
  scale_color_manual(values=c("darkorange","purple4")) +
  theme(legend.position="none") +
  theme(axis.text.y=element_text(size=5)) +
  ylab("log2(fold change) (Cluster 1 vs Cluster 2") +
  xlab("")

ggsave("manuscript_figures/healthy_picrust_clusters.pdf", height=5, width=5, useDingbats=F)
rm(picrust, picrust_results)

Figure 2. Comparison against International Cohorts.

All Ethiopian data was reprocessed by Iyun for comparisons against the AGP and other cohorts which is described in Ademola-Popoola IJ, Abdul-Aziz MA, Ta CK, Barreiro LB, Perry G, Weyrich LS. Evolutionary Histories and Shared Environments Shape Oral Microbiomes in Ugandan Batwa and Bakiga Populations.

comp_table<-read_qza("inputs/weyrich_asvtable.qza")$data
dim(comp_table)
## [1] 1773  811
colSums(comp_table) %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      65   10096   15947   29593   25754  324344
comp_table<-comp_table[,colSums(comp_table)>5000] %>% filter_features(1,1)
dim(comp_table)
## [1] 1773  748
colSums(comp_table) %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5019   11825   17100   31844   26766  324344
comp_meta<-read_tsv("inputs/weyrich_metadata.txt") %>% filter(sample_name %in% colnames(comp_table))
comp_meta %>%  count(country)
## # A tibble: 5 × 2
##   country       n
##   <chr>     <int>
## 1 AGP         492
## 2 Ethiopia    107
## 3 Tanzania     36
## 4 Uganda       97
## 5 Venezuela    16
comp_seqs<-read_qza("weyrichdata/Global_merged_rep_seqs.qza")$data
comp_seqs<-comp_seqs[rownames(comp_table)]

comp_taxonomy<-readRDS("inputs/weyrich_taxonomy.RDS")

comp_meta<-comp_meta %>% dplyr::select(SampleID=sample_name, Country=country)

Figure 2A. Alpha Diversity with global cohort.

comp_sub<-subsample_table(comp_table)

comp_adiv<-comp_sub %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID") %>% left_join(comp_meta)

interactive_table(comp_adiv)
comp_adiv %>%
  group_by(Country) %>%
  summarize_at("ShannonDiversity", lst(mean,median,sd,min,max)) %>%
  interactive_table()
comp_adiv %>%
  ggplot(aes(x=Country, y=ShannonDiversity, fill=Country)) +
  geom_boxplot() +
  ylab("Shannon Diversity") +
  xlab("Country") +
  theme(legend.position="none") +
  coord_cartesian(ylim=c(0, 7))

ggsave("manuscript_figures/comp_alpha_diversity.pdf", height=2, width=2.2, useDingbats=F)

fit<-comp_adiv %>% aov(ShannonDiversity~Country, data=.)
summary(fit)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## Country       4  357.6   89.40   266.4 <2e-16 ***
## Residuals   743  249.4    0.34                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(fit) %>% broom::tidy() %>% interactive_table()

Figure 2B. Beta Diversity with global cohort.

comp_dist<-summarize_taxa(comp_table, as.data.frame(comp_taxonomy))$Genus %>% make_clr() %>% t() %>% dist(., method="euclidean")

comp_pc<-pcoa(comp_dist)

comp_pc$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(comp_meta) %>%
  left_join(clustered_metadata %>% mutate(SampleID=gsub("_",".", SampleID))) %>%
  mutate(Cluster=if_else(is.na(Cluster),"Other", Cluster)) %>%
  #ggplot(aes(x=Axis.1, y=Axis.2, fill=Country)) +
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Country, shape=Cluster)) +
  geom_point() +
  #geom_point(shape=21) +
  scale_shape_manual(values=c(24,25,21)) +
  xlab(paste0("PC1: ",round(comp_pc$values$Relative_eig[1]*100,2),"%")) +
  ylab(paste0("PC2: ",round(comp_pc$values$Relative_eig[2]*100,2),"%"))

ggsave("manuscript_figures/comp_pcoa.pdf", height=2, width=3, useDingbats=F)

vegan::adonis2(comp_dist~Country, data=comp_meta[match(labels(comp_dist), comp_meta$SampleID),])
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = comp_dist ~ Country, data = comp_meta[match(labels(comp_dist), comp_meta$SampleID), ])
##           Df SumOfSqs      R2      F Pr(>F)    
## Model      4   218158 0.33151 92.113  0.001 ***
## Residual 743   439925 0.66849                  
## Total    747   658083 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Figure S2. Differentially abundant families between Ethiopia and other cohorts.

comp_results<-list()
for(f in unique(comp_meta$Country)){
  if(f=="Ethiopia"){next}
  message(f)
  tm<-subset(comp_meta, Country %in% c(f,"Ethiopia")) %>% mutate(Country=if_else(Country=="Ethiopia", "z_Ethiopia", Country))
  tf<-comp_table[,tm$SampleID] %>% filter_features(10,10)
  tf<-summarize_taxa(tf, comp_taxonomy %>% as.data.frame())$Family
  comp_results[[f]]<-aldex(reads=tf, conditions = tm$Country, mc.samples = 128) %>% rownames_to_column("Taxon") %>% mutate(Contrast=paste("Ethiopia vs", f))
}

rm(tm, tf)

comp_results %>%
  bind_rows() %>%
  interactive_table()
comp_results %>%
  bind_rows() %>%
  mutate(Sig=if_else(abs(diff.btw)>1 & we.eBH<0.1, "*","ns")) %>%
  mutate(Label=if_else(Sig=="*", gsub("..+; ","", Taxon) %>% gsub("NA","", .), "")) %>%
  mutate(Pval=we.ep+min_nonzero(we.ep))  %>% # to get these on the plot
  ggplot(aes(x=diff.btw, y=-log10(Pval), color=Sig)) +
  geom_point(shape=16, alpha=0.5) +
  facet_wrap(~Contrast, ncol=2, scales="free") +
  xlab("log2Fold Change (Ethiopia vs Cohort)") +
  scale_color_manual(values=c("indianred","grey50")) +
  ggrepel::geom_text_repel(aes(label=Label), size=2) +
  theme(legend.position="none") +
  ylab("-log10(P-value)")

ggsave("manuscript_figures/comp_volcano.pdf", width=7, height=5, useDingbats=F)

Figure 3. Covariates of Ethipian microbiota structure

Table S2. Mycotoxins

covariates<-
  metadata %>%
  left_join(read_excel("inputs/mycotoxins.xlsx")) %>%
  dplyr::select(SampleID, 
                Group, 
                Sex, 
                Age, 
                CigaretteUse=Smoking, 
                ChewingTobaccoUse=`Chewing tobacco`,  
                AlcoholUse=`Alcohol Drink`, 
                KhatUse=`Khat use`, 
                CoffeeUse=`Coffee drinking`, 
                Mycotoxin_CIT=CIT,
                Mycotoxin_DON=DON,
                Mycotoxin_OTA=OTA,
                Mycotoxin_TA=TA
  ) %>%
  left_join(
    read_excel("inputs/mycotoxins.xlsx") %>%
    column_to_rownames("SampleID") %>%
    apply(., 1, function(x) sum(x>0)) %>%
    data.frame(Mycotoxin_TotalDetected=.) %>%
    rownames_to_column("SampleID")
  ) %>%
    left_join(
    read_excel("inputs/mycotoxins.xlsx") %>%
    column_to_rownames("SampleID") %>%
    apply(., 1, function(x) sum(x)) %>%
    data.frame(Mycotoxin_TotalConcentration=.) %>%
    rownames_to_column("SampleID")
  )

interactive_table(covariates)
nonzero<-function(x){sum(x>0)}

covariates %>%
  pivot_longer(contains("Mycotoxin"), names_to = "Metric", values_to="conc_ugL") %>%
  group_by(Group, Metric) %>%
  dplyr::select(-Age) %>%
  summarize_if(is.numeric, lst(mean,sd,median,min,max, length, nonzero)) %>%
  arrange(Metric) %>%
  mutate(Mean_SD=paste0(mean, " ± ", sd),
         median_range=paste0(median, " [",min," - ",max, "]")) %>%
  dplyr::select(Group, Metric, Mean_SD, median_range, nonzero) %>%
  interactive_table()
covariates %>%
  pivot_longer(contains("Mycotoxin"), names_to = "Metric", values_to="conc_ugL") %>%
  group_by(Metric) %>%
  do(
    wilcox.test(conc_ugL~Group, data=.) %>%
      broom::tidy()
  ) %>%
  interactive_table()
covariates %>%
  pivot_longer(contains("Mycotoxin"), names_to = "Metric", values_to="conc_ugL") %>%
  group_by(Group, Metric) %>%
  dplyr::select(-Age) %>%
  summarize_if(is.numeric, lst(mean,sd,median,min,max, length, detected=nonzero)) %>%
  ungroup() %>%
  mutate(notdetected=length-detected) %>%
  dplyr::select(Group, Metric, detected, notdetected) %>%
  group_by(Metric) %>%
  summarize(Fisher_Pvalue=fisher.test(matrix(c(detected, notdetected), nrow =2))$p)
## # A tibble: 6 × 2
##   Metric                       Fisher_Pvalue
##   <chr>                                <dbl>
## 1 Mycotoxin_CIT                     2.81e- 4
## 2 Mycotoxin_DON                     7.32e- 1
## 3 Mycotoxin_OTA                     1   e+ 0
## 4 Mycotoxin_TA                      4.59e-12
## 5 Mycotoxin_TotalConcentration      1   e+ 0
## 6 Mycotoxin_TotalDetected           1   e+ 0

Dropping CIT as co-variate for healthy cohort as not found in healthy group. Also, dropping total mycotoxin going forward as it is highly co-linear with the levels of TA.

Figure 3A. Alpha Diversity

Finding a model that minimizes the AIC of the model.

alpha_cov<-
fig1_adiversity %>%
  dplyr::select(SampleID, PhylogeneticDiversity, ASV_Richness, ShannonDiversity) %>%
  left_join(covariates) %>%
  filter(Group=="Healthy")

fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit) # 161.05
## 
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + KhatUse + 
##     CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + 
##     Mycotoxin_TotalDetected, data = alpha_cov)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.8777141  0.3459136  11.210   <2e-16 ***
## SexM                     0.2387724  0.1230321   1.941   0.0552 .  
## Age                     -0.0039520  0.0080251  -0.492   0.6235    
## AlcoholUseYes           -0.2882365  0.1409137  -2.045   0.0435 *  
## KhatUseYes               0.1995013  0.2513512   0.794   0.4293    
## CoffeeUseYes             0.1761844  0.1702886   1.035   0.3034    
## Mycotoxin_DON           -0.1991542  0.6554466  -0.304   0.7619    
## Mycotoxin_OTA            0.0461860  0.0339663   1.360   0.1770    
## Mycotoxin_TA             0.0006513  0.0019799   0.329   0.7429    
## Mycotoxin_TotalDetected  0.0913024  0.0835819   1.092   0.2773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.23381)
## 
##     Null deviance: 26.220  on 107  degrees of freedom
## Residual deviance: 22.913  on  98  degrees of freedom
## AIC: 161.05
## 
## Number of Fisher Scoring iterations: 2
#khat use only 4 individuals,  remove
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit) # 159.74
## 
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + CoffeeUse + 
##     Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, 
##     data = alpha_cov)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.8520789  0.3437583  11.206   <2e-16 ***
## SexM                     0.2344495  0.1226815   1.911   0.0589 .  
## Age                     -0.0036321  0.0080000  -0.454   0.6508    
## AlcoholUseYes           -0.2864388  0.1406320  -2.037   0.0443 *  
## CoffeeUseYes             0.1854984  0.1695660   1.094   0.2766    
## Mycotoxin_DON           -0.1966582  0.6542131  -0.301   0.7643    
## Mycotoxin_OTA            0.0441414  0.0338051   1.306   0.1947    
## Mycotoxin_TA             0.0007249  0.0019741   0.367   0.7143    
## Mycotoxin_TotalDetected  0.0991840  0.0828347   1.197   0.2340    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2329361)
## 
##     Null deviance: 26.220  on 107  degrees of freedom
## Residual deviance: 23.061  on  99  degrees of freedom
## AIC: 159.74
## 
## Number of Fisher Scoring iterations: 2
#similarly, only 9 people don't consume coffee
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit) # 159.04
## 
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + Mycotoxin_DON + 
##     Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data = alpha_cov)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.9970994  0.3174811  12.590   <2e-16 ***
## SexM                     0.2367042  0.1227848   1.928   0.0567 .  
## Age                     -0.0029925  0.0079864  -0.375   0.7087    
## AlcoholUseYes           -0.2968261  0.1404490  -2.113   0.0371 *  
## Mycotoxin_DON           -0.2012688  0.6548428  -0.307   0.7592    
## Mycotoxin_OTA            0.0471674  0.0337249   1.399   0.1650    
## Mycotoxin_TA             0.0005965  0.0019725   0.302   0.7630    
## Mycotoxin_TotalDetected  0.0994142  0.0829159   1.199   0.2334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2333944)
## 
##     Null deviance: 26.220  on 107  degrees of freedom
## Residual deviance: 23.339  on 100  degrees of freedom
## AIC: 159.04
## 
## Number of Fisher Scoring iterations: 2
#Mycotoxin_TotalDetected is an integer which might cause some issues, will remove
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA, data=alpha_cov)
summary(fit) # 158.58
## 
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + Mycotoxin_DON + 
##     Mycotoxin_OTA + Mycotoxin_TA, data = alpha_cov)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.106489   0.304747  13.475   <2e-16 ***
## SexM           0.230503   0.122941   1.875   0.0637 .  
## Age           -0.001822   0.007944  -0.229   0.8190    
## AlcoholUseYes -0.287484   0.140536  -2.046   0.0434 *  
## Mycotoxin_DON  0.030134   0.627109   0.048   0.9618    
## Mycotoxin_OTA  0.046432   0.033792   1.374   0.1725    
## Mycotoxin_TA   0.001593   0.001793   0.889   0.3762    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2344055)
## 
##     Null deviance: 26.220  on 107  degrees of freedom
## Residual deviance: 23.675  on 101  degrees of freedom
## AIC: 158.58
## 
## Number of Fisher Scoring iterations: 2
#Minimize to just obvious variables with larger coefficients
fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse, data=alpha_cov)
summary(fit)  # 155.62
## 
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse, data = alpha_cov)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.247792   0.292244  14.535   <2e-16 ***
## SexM           0.243276   0.120092   2.026   0.0454 *  
## Age           -0.003609   0.007856  -0.459   0.6469    
## AlcoholUseYes -0.305634   0.139361  -2.193   0.0305 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2341533)
## 
##     Null deviance: 26.220  on 107  degrees of freedom
## Residual deviance: 24.352  on 104  degrees of freedom
## AIC: 155.62
## 
## Number of Fisher Scoring iterations: 2
fit<-glm(ShannonDiversity~Sex + AlcoholUse, data=alpha_cov)
summary(fit) # 153.84
## 
## Call:
## glm(formula = ShannonDiversity ~ Sex + AlcoholUse, data = alpha_cov)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.11599    0.05541  74.283   <2e-16 ***
## SexM           0.22650    0.11397   1.987   0.0495 *  
## AlcoholUseYes -0.32979    0.12858  -2.565   0.0117 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2323939)
## 
##     Null deviance: 26.220  on 107  degrees of freedom
## Residual deviance: 24.401  on 105  degrees of freedom
## AIC: 153.84
## 
## Number of Fisher Scoring iterations: 2

Below, will show all tested coefficients, but will only report sex and alcohol use as significant based on final model.

fit<-glm(ShannonDiversity~Sex + Age + AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data=alpha_cov)
summary(fit)
## 
## Call:
## glm(formula = ShannonDiversity ~ Sex + Age + AlcoholUse + KhatUse + 
##     CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + 
##     Mycotoxin_TotalDetected, data = alpha_cov)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.8777141  0.3459136  11.210   <2e-16 ***
## SexM                     0.2387724  0.1230321   1.941   0.0552 .  
## Age                     -0.0039520  0.0080251  -0.492   0.6235    
## AlcoholUseYes           -0.2882365  0.1409137  -2.045   0.0435 *  
## KhatUseYes               0.1995013  0.2513512   0.794   0.4293    
## CoffeeUseYes             0.1761844  0.1702886   1.035   0.3034    
## Mycotoxin_DON           -0.1991542  0.6554466  -0.304   0.7619    
## Mycotoxin_OTA            0.0461860  0.0339663   1.360   0.1770    
## Mycotoxin_TA             0.0006513  0.0019799   0.329   0.7429    
## Mycotoxin_TotalDetected  0.0913024  0.0835819   1.092   0.2773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.23381)
## 
##     Null deviance: 26.220  on 107  degrees of freedom
## Residual deviance: 22.913  on  98  degrees of freedom
## AIC: 161.05
## 
## Number of Fisher Scoring iterations: 2
summary(fit)$coefficients %>%
  as.data.frame() %>%
  rownames_to_column("Covariate") %>%
  filter(Covariate!="(Intercept)") %>%
  arrange(Estimate) %>%
  mutate(Covariate=factor(Covariate, levels=Covariate)) %>%
  ggplot(aes(x=Covariate, y=Estimate, ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`, label=paste("P=", round(`Pr(>|t|)`,2)))) +
  geom_hline(yintercept = 0, linetype="dashed", color="grey50") +
  geom_point() +
  geom_errorbar(width=0) +
  geom_text(size=2) +
  coord_flip() +
  ylab("Effect on Shannon's Diversity (Estimate ± SE)")

ggsave("manuscript_figures/covariates_healthy.pdf", height=2, width=2.5, useDingbats=F)

Figure 3B. Boxplot of Acohol and Sex as covariates.

alpha_cov %>% 
  ggplot(aes(x=Sex, y=ShannonDiversity, fill=AlcoholUse)) +
  #geom_boxplot(outlier.alpha=0,) +
  stat_summary(geom="bar", position=position_dodge(), color="black") +
  geom_point(position=position_jitterdodge(jitter.width=0.2, jitter.height=0), shape=21) +
  coord_cartesian(ylim=c(2, 6)) +
  scale_fill_manual(values=c("grey50","indianred"))

ggsave("manuscript_figures/sex_alcohol.pdf", height=2, width=2.5, useDingbats=F)

fit<-alpha_cov %>% aov(ShannonDiversity~AlcoholUse*Sex, data=.)
summary(fit)
##                 Df Sum Sq Mean Sq F value Pr(>F)  
## AlcoholUse       1  0.900  0.9005   3.840 0.0527 .
## Sex              1  0.918  0.9178   3.914 0.0505 .
## AlcoholUse:Sex   1  0.012  0.0116   0.049 0.8245  
## Residuals      104 24.390  0.2345                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Figure 3C. ADONIS

vegan::adonis2(dists$`CLR Euclidean`~Sex + Age +  AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected,
               data=covariates[match(labels(dists$`CLR Euclidean`), covariates$SampleID),],by="terms", permutations=9999, parallel=10) %>% print()
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
## 
## vegan::adonis2(formula = dists$`CLR Euclidean` ~ Sex + Age + AlcoholUse + KhatUse + CoffeeUse + Mycotoxin_DON + Mycotoxin_OTA + Mycotoxin_TA + Mycotoxin_TotalDetected, data = covariates[match(labels(dists$`CLR Euclidean`), covariates$SampleID), ], permutations = 9999, by = "terms", parallel = 10)
##                          Df SumOfSqs      R2      F Pr(>F)  
## Sex                       1     2370 0.01778 1.9098 0.0320 *
## Age                       1      996 0.00748 0.8031 0.6750  
## AlcoholUse                1      997 0.00748 0.8035 0.6759  
## KhatUse                   1     1380 0.01036 1.1121 0.2786  
## CoffeeUse                 1      835 0.00627 0.6728 0.8852  
## Mycotoxin_DON             1     1117 0.00838 0.9000 0.5290  
## Mycotoxin_OTA             1     2082 0.01563 1.6784 0.0563 .
## Mycotoxin_TA              1      992 0.00744 0.7993 0.6790  
## Mycotoxin_TotalDetected   1      874 0.00656 0.7046 0.8412  
## Residual                 98   121596 0.91262                
## Total                   107   133238 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Figure 3D. Sex by PC2

pcos$`CLR Euclidean`$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata) %>%
    left_join(clustered_metadata) %>%
    ggplot(aes(x=Sex, y=Axis.2, fill=Sex)) +
    geom_boxplot() +
    #stat_summary(geom="bar", position=position_dodge(), color="black") +
    geom_point(position=position_jitterdodge(jitter.width=0.2, jitter.height=0), shape=21) +
    ylab(paste0("PC2: ",round(pcos$`CLR Euclidean`$values$Relative_eig[2]*100,2),"%")) +
  coord_cartesian(ylim=c(-20, 35)) +
  scale_fill_manual(values=c("grey50","dodgerblue"))

ggsave("manuscript_figures/sex.pdf", heigh=2, width=2, useDingbats=F)

pcos$`CLR Euclidean`$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata) %>%
    left_join(clustered_metadata) %>%
  t.test(Axis.2~Sex, data=.)
## 
##  Welch Two Sample t-test
## 
## data:  Axis.2 by Sex
## t = -2.5498, df = 37.359, p-value = 0.01501
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -10.29551  -1.17964
## sample estimates:
## mean in group F mean in group M 
##       -1.487520        4.250057

Figure 4. Cross-sectional analysis by cancer status

Figure 4ABC. Alpha Diversity

adiversity<-
  asv_table %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
  rownames_to_column("SampleID") %>%
  dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
  full_join(
    asv_table %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
  ) %>%
  left_join(metadata)

interactive_table(adiversity)
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric, Group) %>%
  summarize_at("Diversity", lst(mean,median,sd,min,max)) %>%
  interactive_table()
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  ggplot(aes(x=Group, y=Diversity, fill=Group)) +
  geom_boxplot() +
  facet_wrap(~Metric, scales="free") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  theme(legend.position="none") +
  xlab("") +
  ylab("") +
  scale_fill_manual(values=c("cornflowerblue","indianred"))

ggsave("manuscript_figures/cross_section_alpha_diversity.pdf", height=2, width=3, useDingbats=F)

adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric) %>%
  do(
    t.test(Diversity~Group, data=.) %>%
      broom::tidy()
  ) %>%
  knitr::kable()
Metric estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
ASV_Richness 60.3993168 289.574074 229.174757 4.536983 0.0000096 207.6511 34.1540188 86.6446148 Welch Two Sample t-test two.sided
PhylogeneticDiversity 2.5903518 16.557070 13.966718 4.959897 0.0000015 208.9131 1.5607783 3.6199253 Welch Two Sample t-test two.sided
ShannonDiversity 0.1950222 4.116715 3.921693 3.019831 0.0028451 208.4917 0.0677077 0.3223367 Welch Two Sample t-test two.sided
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric) %>%
  do(
    wilcox.test(Diversity~Group, data=.) %>%
      broom::tidy()
  ) %>%
  knitr::kable()
Metric statistic p.value method alternative
ASV_Richness 7447.5 0.0000212 Wilcoxon rank sum test with continuity correction two.sided
PhylogeneticDiversity 7557.0 0.0000068 Wilcoxon rank sum test with continuity correction two.sided
ShannonDiversity 6892.0 0.0027084 Wilcoxon rank sum test with continuity correction two.sided
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric) %>%
  do(
    glm(Diversity~Group + Age + Sex + `Alcohol Drink`, data=.) %>%
      broom::tidy()
  ) %>%
  knitr::kable()
Metric term estimate std.error statistic p.value
ASV_Richness (Intercept) 306.7554582 26.2328232 11.6935740 0.0000000
ASV_Richness GroupDiseased -56.3860313 16.1213214 -3.4976061 0.0005750
ASV_Richness Age -0.3643145 0.6593569 -0.5525300 0.5811843
ASV_Richness SexM -4.5604812 15.7309539 -0.2899049 0.7721806
ASV_Richness Alcohol DrinkYes -9.7430378 21.9441927 -0.4439916 0.6575147
PhylogeneticDiversity (Intercept) 17.6813408 1.0249613 17.2507394 0.0000000
PhylogeneticDiversity GroupDiseased -2.1844359 0.6298876 -3.4679771 0.0006384
PhylogeneticDiversity Age -0.0282538 0.0257622 -1.0967146 0.2740466
PhylogeneticDiversity SexM -0.2064235 0.6146353 -0.3358472 0.7373279
PhylogeneticDiversity Alcohol DrinkYes 0.1625281 0.8573972 0.1895598 0.8498407
ShannonDiversity (Intercept) 4.2067288 0.1259261 33.4063259 0.0000000
ShannonDiversity GroupDiseased -0.2117943 0.0773876 -2.7367985 0.0067462
ShannonDiversity Age -0.0018969 0.0031651 -0.5993137 0.5496223
ShannonDiversity SexM 0.0905621 0.0755137 1.1992807 0.2317964
ShannonDiversity Alcohol DrinkYes -0.2137518 0.1053393 -2.0291749 0.0437270
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  filter(Metric=="ASV_Richness") %>%
  do(
    glm.nb(Diversity~Group + Age + Sex + `Alcohol Drink`, data=.) %>%
      broom::tidy()
  )  %>%
  knitr::kable()
term estimate std.error statistic p.value
(Intercept) 5.7514750 0.1086749 52.9236803 0.0000000
GroupDiseased -0.2133090 0.0667491 -3.1956840 0.0013950
Age -0.0017937 0.0027327 -0.6563806 0.5115793
SexM -0.0284362 0.0651401 -0.4365396 0.6624453
Alcohol DrinkYes -0.0266784 0.0908267 -0.2937282 0.7689656

Figure 4D. Beta Diversity

genus_sum<-summarize_taxa(asv_table, asv_taxonomy %>% column_to_rownames("ASV"))$Genus

dists<-list()
  dists$`CLR Euclidean`<-genus_sum %>% make_clr() %>% t() %>% dist(., method="euclidean")

pcos<-lapply(dists, ape::pcoa)

lapply(names(pcos), function(x){
    pcos[[x]]$values %>%
      as.data.frame() %>%
      mutate(Metric=x) %>%
      mutate(Axis=1:nrow(.))
  }) %>%
  bind_rows() %>%
  filter(Axis %in% 1:3) %>%
  dplyr::select(Metric, Axis, everything()) %>%
  interactive_table()
  pcos$`CLR Euclidean`$vectors %>%
    as.data.frame() %>%
    rownames_to_column("SampleID") %>%
      left_join(metadata) %>%
    ggplot(aes(x=Axis.1, y=Axis.2, fill=Group)) +
    geom_point(shape=21, color="black") +
    theme(legend.position="bottom") +
    xlab(paste0("PC1: ",round(pcos$`CLR Euclidean`$values$Relative_eig[1]*100,2),"%")) +
    ylab(paste0("PC2: ",round(pcos$`CLR Euclidean`$values$Relative_eig[2]*100,2),"%")) +
    scale_fill_manual(values=c("cornflowerblue","indianred"))

  ggsave("manuscript_figures/cancer_pcoa_aitchison.pdf", height=2.2, width=2)


vegan::adonis2(dists$`CLR Euclidean`~Group + Age + Sex + `Alcohol Drink`, data=metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID),], by="terms") %>% print()
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = dists$`CLR Euclidean` ~ Group + Age + Sex + `Alcohol Drink`, data = metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID), ], by = "terms")
##                  Df SumOfSqs      R2       F Pr(>F)    
## Group             1    13037 0.04582 10.0316  0.001 ***
## Age               1     1261 0.00443  0.9699  0.405    
## Sex               1     1394 0.00490  1.0725  0.288    
## `Alcohol Drink`   1     1098 0.00386  0.8447  0.594    
## Residual        206   267725 0.94099                   
## Total           210   284515 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Figure 4E. Clusters

Metric="CLR Euclidean"
gapstats<-clusGap(pcos[[Metric]]$vectors, FUN=do_pam, K.max = 10, B=100)

gapstats<-
  gapstats$Tab %>%
  as.data.frame() %>%
  mutate(Nclusters=1:nrow(.)) %>%
  dplyr::select(Nclusters, GapStatistic=gap, SE=SE.sim)

ggplot(gapstats, aes(x=Nclusters, y=GapStatistic, ymin=GapStatistic-SE, ymax=GapStatistic+SE)) +
  geom_errorbar(width=0) +
  geom_point() +
  scale_x_continuous(breaks=(1:10))

clusters<-pam(dists[[Metric]], k=2, diss=TRUE, cluster.only = TRUE)
clustered_metadata_all<-tibble(SampleID=names(clusters), Cluster_all=paste0("Cluster_", clusters))

clustered_metadata %>% left_join(clustered_metadata_all) %>% interactive_table() #they are flipped relative to previous assignments to 1 vs 2, will re-arranged.
clustered_metadata_all<-clustered_metadata_all %>% mutate(Cluster=if_else(Cluster_all=="Cluster_1", "Cluster_2","Cluster_1"))

metadata %>% left_join(clustered_metadata_all) %>%
  ggplot(aes(x=Group, fill=Cluster)) +
  geom_bar(color="black") +
  scale_fill_manual(values=c("darkorange","purple4")) +
  ylab("# Participants")

  ggsave("manuscript_figures/cancer_clusters.pdf", height=2, width=2)

  
  metadata %>% left_join(clustered_metadata_all) %>%
    group_by(Group, Cluster) %>%
    summarize(N=n()) %>%
    pivot_wider(names_from = "Cluster", values_from = "N") %>%
    knitr::kable()
Group Cluster_1 Cluster_2
Healthy 70 38
Diseased 37 66
  metadata %>% left_join(clustered_metadata_all) %>%
    group_by(Group, Cluster) %>%
    summarize(N=n()) %>%
    pivot_wider(names_from = "Cluster", values_from = "N") %>%
    dplyr::select(1,3,2) %>%
    as.data.frame() %>%
    column_to_rownames("Group") %>%
    .[c(2,1),] %>% #flip order to change how OR is described
    fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value = 3.36e-05
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.801207 6.009433
## sample estimates:
## odds ratio 
##   3.266217

Now will confirm with multivariable logistic regression

fit<-
metadata %>% left_join(clustered_metadata_all) %>%
  mutate(DiseaseBin=if_else(Group=="Diseased", 1, 0)) %>%
  glm(DiseaseBin~ Cluster + Age + Sex + `Alcohol Drink`, family=binomial, data=.)

summary(fit)  
## 
## Call:
## glm(formula = DiseaseBin ~ Cluster + Age + Sex + `Alcohol Drink`, 
##     family = binomial, data = .)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -6.70210    1.03267  -6.490 8.58e-11 ***
## ClusterCluster_2    1.06720    0.35347   3.019  0.00253 ** 
## Age                 0.14581    0.02426   6.010 1.86e-09 ***
## SexM                0.20069    0.42789   0.469  0.63905    
## `Alcohol Drink`Yes -2.61517    0.65653  -3.983 6.80e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 292.39  on 210  degrees of freedom
## Residual deviance: 197.33  on 206  degrees of freedom
## AIC: 207.33
## 
## Number of Fisher Scoring iterations: 5
broom::tidy(fit, exponentiate = TRUE, conf.int=TRUE) %>% knitr::kable()
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.0012283 1.0326684 -6.4900813 0.0000000 0.0001390 0.0080986
ClusterCluster_2 2.9072295 0.3534698 3.0192132 0.0025343 1.4671815 5.8979473
Age 1.1569814 0.0242626 6.0098498 0.0000000 1.1069684 1.2178495
SexM 1.2222476 0.4278900 0.4690259 0.6390511 0.5252974 2.8410213
Alcohol DrinkYes 0.0731554 0.6565310 -3.9833142 0.0000680 0.0183859 0.2464655

Figure S3. Cross-section by ESCC vs EAC

Figure S3ABC. Alpha Diversity

cancer_asvs<-asv_table[,subset(metadata, Group=="Diseased")$SampleID] %>% filter_features(1,1)

adiversity<-
  cancer_asvs %>% subsample_table() %>% t() %>% picante::pd(., asv_tree) %>%
  rownames_to_column("SampleID") %>%
  dplyr::select(SampleID, PhylogeneticDiversity="PD", ASV_Richness="SR") %>%
  full_join(
    cancer_asvs %>% subsample_table() %>% t() %>% diversity(., "shannon") %>% data.frame(ShannonDiversity=.) %>% rownames_to_column("SampleID")
  ) %>%
  left_join(metadata)

interactive_table(adiversity)
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric, Histology) %>%
  summarize_at("Diversity", lst(mean,median,sd,min,max)) %>%
  interactive_table()
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  ggplot(aes(x=Histology, y=Diversity, fill=Histology)) +
  geom_boxplot() +
  facet_wrap(~Metric, scales="free") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  theme(legend.position="none") +
  xlab("") +
  ylab("")

ggsave("manuscript_figures/histology_alpha_diversity.pdf", height=2, width=3, useDingbats=F)

adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric) %>%
  do(
    t.test(Diversity~Histology, data=.) %>%
      broom::tidy()
  ) %>%
  knitr::kable()
Metric estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
ASV_Richness -6.5387931 224.875000 231.413793 -0.2675944 0.7916005 21.17681 -57.3292898 44.251704 Welch Two Sample t-test two.sided
PhylogeneticDiversity -0.1683577 13.923964 14.092322 -0.1634073 0.8717653 20.93045 -2.3114069 1.974691 Welch Two Sample t-test two.sided
ShannonDiversity -0.0122405 3.912341 3.924581 -0.1023655 0.9194256 21.26808 -0.2607219 0.236241 Welch Two Sample t-test two.sided
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric) %>%
  do(
    wilcox.test(Diversity~Histology, data=.) %>%
      broom::tidy()
  ) %>%
  knitr::kable()
Metric statistic p.value method alternative
ASV_Richness 680.5 0.8913695 Wilcoxon rank sum test with continuity correction two.sided
PhylogeneticDiversity 688.0 0.9455600 Wilcoxon rank sum test with continuity correction two.sided
ShannonDiversity 680.0 0.8877758 Wilcoxon rank sum test with continuity correction two.sided
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  group_by(Metric) %>%
  do(
    glm(Diversity~Histology + Age + Sex + `Alcohol Drink`, data=.) %>%
      broom::tidy()
  )  %>%
  knitr::kable()
Metric term estimate std.error statistic p.value
ASV_Richness (Intercept) 280.7962636 42.3845654 6.6249650 0.0000000
ASV_Richness HistologyESCC 0.2271049 24.7288868 0.0091838 0.9926912
ASV_Richness Age -0.7582168 0.6805618 -1.1141043 0.2679594
ASV_Richness SexM -30.3218279 19.3841082 -1.5642622 0.1209795
ASV_Richness Alcohol DrinkYes 14.3069739 38.8977330 0.3678100 0.7138081
PhylogeneticDiversity (Intercept) 17.3448585 1.7359790 9.9913987 0.0000000
PhylogeneticDiversity HistologyESCC -0.1751294 1.0128411 -0.1729091 0.8630795
PhylogeneticDiversity Age -0.0517987 0.0278743 -1.8582938 0.0661289
PhylogeneticDiversity SexM -1.2657805 0.7939306 -1.5943212 0.1140843
PhylogeneticDiversity Alcohol DrinkYes 0.9229716 1.5931660 0.5793317 0.5636947
ShannonDiversity (Intercept) 3.9630797 0.2134718 18.5648865 0.0000000
ShannonDiversity HistologyESCC 0.0060704 0.1245482 0.0487392 0.9612263
ShannonDiversity Age -0.0005871 0.0034277 -0.1712953 0.8643448
ShannonDiversity SexM -0.0334025 0.0976289 -0.3421375 0.7329801
ShannonDiversity Alcohol DrinkYes -0.0284999 0.1959102 -0.1454745 0.8846350
adiversity %>%
  pivot_longer(c("PhylogeneticDiversity","ASV_Richness","ShannonDiversity"), names_to = "Metric", values_to = "Diversity") %>%
  filter(Metric=="ASV_Richness") %>%
  do(
    glm.nb(Diversity~Histology + Age + Sex + `Alcohol Drink`, data=.) %>%
      broom::tidy()
  )  %>%
  knitr::kable()
term estimate std.error statistic p.value
(Intercept) 5.6414444 0.1978516 28.5135157 0.0000000
HistologyESCC 0.0028321 0.1154336 0.0245346 0.9804262
Age -0.0030589 0.0031781 -0.9624956 0.3358007
SexM -0.1317515 0.0904872 -1.4560238 0.1453860
Alcohol DrinkYes 0.0599459 0.1815510 0.3301875 0.7412583

Figure S3D. Beta Diversity

genus_sum<-summarize_taxa(cancer_asvs, asv_taxonomy %>% column_to_rownames("ASV"))$Genus

dists<-list()
  dists$`CLR Euclidean`<-genus_sum %>% make_clr() %>% t() %>% dist(., method="euclidean")

  pcos<-lapply(dists, ape::pcoa)

  pcos$`CLR Euclidean`$vectors %>%
    as.data.frame() %>%
    rownames_to_column("SampleID") %>%
      left_join(metadata) %>%
    ggplot(aes(x=Axis.1, y=Axis.2, fill=Histology)) +
    geom_point(shape=21, color="black") +
    theme(legend.position="bottom") +
    xlab(paste0("PC1: ",round(pcos$`CLR Euclidean`$values$Relative_eig[1]*100,2),"%")) +
    ylab(paste0("PC2: ",round(pcos$`CLR Euclidean`$values$Relative_eig[2]*100,2),"%"))

  ggsave("manuscript_figures/histology_pcoa_aitchison.pdf", height=2.2, width=2)


vegan::adonis2(dists$`CLR Euclidean`~Histology + Age + Sex + `Alcohol Drink`, data=metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID),])
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = dists$`CLR Euclidean` ~ Histology + Age + Sex + `Alcohol Drink`, data = metadata[match(labels(dists$`CLR Euclidean`), metadata$SampleID), ])
##           Df SumOfSqs      R2      F Pr(>F)
## Model      4     6121 0.04581 1.1763  0.141
## Residual  98   127478 0.95419              
## Total    102   133599 1.00000

Figure 5. Biomarkers between Cases and Controls

Table S3. Differential species

genus_sums<-summarize_taxa(asv_table, asv_taxonomy %>% column_to_rownames("ASV"))$Species # disregard variable name of genus, species abundances were carried forward for greater specificity
genus_sums<-genus_sums[,metadata$SampleID] %>% filter_features(20,20) # must be in at least 20 samples

clr<-aldex.clr(genus_sums, model.matrix(~Group + Sex + Age + `Alcohol Drink`, metadata), mc.samples = 128, denom = "all")

aldex_results<-aldex.glm(clr, fdr.method="BH")

aldex_results %>%
  rownames_to_column("Taxon") %>%
  dplyr::select(Taxon, Group_Estimate=`GroupDiseased:Est`, Group_Pvalue=`GroupDiseased:pval`, Group_FDR=`GroupDiseased:pval.padj`) %>%
  filter(Group_FDR<0.1) %>%
  arrange(desc(Group_Estimate)) %>%
  interactive_table()

Figure 5A. Cladogram of significantly different genera

treedat<-
aldex_results %>%
  rownames_to_column("Taxon") %>%
  dplyr::select(Taxon, Group_Estimate=`GroupDiseased:Est`, Group_SE=`GroupDiseased:SE`, Group_Pvalue=`GroupDiseased:pval`, Group_FDR=`GroupDiseased:pval.padj`) %>%
  filter(Group_FDR<0.1) %>%
  mutate(Taxon=paste0("root; ", Taxon)) %>%
  separate(Taxon, c("Root","Kingdom","Phylum","Class","Order","Family","Genus","Species"), sep="; ", remove = F) %>%
  mutate(Species=paste0("S",1:nrow(.),":",Genus," ", Species)) %>%
  mutate_if(is.character, function(x) if_else(x=="NA", "Not Assigned", x)) %>% #replace empty character columns with Not assigned
  mutate(pathString=paste(Root, Kingdom, Phylum, Class, Order, Family, Genus, Species, sep="/")) %>% #create a new column pathString by concatenating values from Kingdom, Phylum, Class, Order, Family, and Genus columns, separated by "/".
  arrange(pathString)

clado <-
treedat %>%
  dplyr::select(pathString, Taxon) %>%
  data.tree::as.Node() %>%
  data.tree::as.phylo.Node()
  

plottree<-ggtree( clado, layout="circular") %<+% (treedat %>% mutate(Snum=gsub(":..+","", Species)) %>% dplyr::select(Snum, everything())) +
  geom_tippoint(aes(fill=Group_Estimate, size=-log10(Group_FDR)), shape=21) +
  geom_tiplab(aes(label=gsub("NA", "sp.", Species) %>% gsub("S[0-9]+:","",.)), size=2, offset=0.7) +
  scale_fill_gradient2(low="cornflowerblue",high="darkred") +
  theme(legend.position="bottom") 
  

interest<-c("Clostridia","Veillonellales-Selenomonadales","Lactobacillales","Actinobacteriota","Bacteroidota","Gammaproteobacteria")
for( i in interest){
  message(i)
  tips<-treedat %>% filter(grepl(i, pathString)) %>% pull(Species) %>% gsub(":..+","", .)
  nd<-getMRCA(phy=clado, tip=as.character(tips))
  plottree<- 
    plottree + 
    #geom_hilight(
    #node=nd,
    #fill="grey50",
    #alpha=0.2
    #) +
    geom_cladelabel(node=nd, label=i, offset = 10)
}

plottree

ggsave("manuscript_figures/cancer_cladogram.pdf", height=6, width=6, useDingbats=F)

Table S4. PICRUST Case vs Control

picrust<-read_tsv("picrust2_out_strat/pathways_w_desc.tsv")
picrust<-picrust[,c("pathway","description", colnames(asv_table))]

picrust_results<-
picrust %>%
  pivot_longer(!c(pathway, description), names_to = "SampleID", values_to = "Abundance") %>%
  group_by(SampleID) %>%  
  mutate(Abundance=(Abundance/sum(Abundance)))  %>% # convert to proportion... shouldn't be necessary as already subsampled.
  group_by(pathway) %>%
  mutate(Abundance=log2(Abundance+(min_nonzero(Abundance)*(2/3)))) %>%
  ungroup() %>%
  left_join(metadata) %>%
  group_by(pathway, description) %>%
  do(
    glm(Abundance~Group + Sex + Age + `Alcohol Drink`, data=.) %>%
      broom::tidy()
  ) %>%
  group_by(term) %>%
  mutate(FDR=p.adjust(p.value, method="BH")) %>%
  ungroup()

picrust_results %>%
  filter(term!="(Intercept)" & FDR<0.1) %>%
  interactive_table() # no significant confounders

Will now extract the taxa driving these changes. Will only extract the most abundant taxa driving

sigpath<-
picrust_results %>%
  filter(term=="GroupDiseased" & FDR<0.1) %>%
  pull(pathway)

pathstrat<-read_tsv("picrust2_out_strat/pathways_out/path_abun_contrib.tsv.gz") %>% dplyr::rename(Pathway=`function`) %>% filter(Pathway %in% sigpath)

contribs<-
pathstrat %>% 
  dplyr::rename(ASV=taxon) %>% 
  left_join(asv_taxonomy) %>%
  group_by(Genus, Pathway) %>%
  summarize(mean=mean(taxon_rel_function_abun)) %>%
  group_by(Pathway) %>%
  top_n(5, mean) %>%
  arrange(desc(mean)) %>%
  summarize(Genera=paste(Genus, collapse = ", "))

picrust_results %>%
  filter(term!="(Intercept)" & FDR<0.1) %>%
  left_join(contribs %>% dplyr::rename(pathway=Pathway)) %>%
  interactive_table() # no significant confounders

Figure 5B. Picrust vs canacer

lineage<-read_tsv("/data/shared_resources/databases/metacyc/25.1/map_metacyc-pwy_lineage.tsv", col_names=c("ID","Lineage")) #dug up from https://forum.biobakery.org/t/metacyc-hierarchy-to-invetigate-identify-specific-pathways/1830/4

plot_results<-picrust_results %>%filter(term=="GroupDiseased" & FDR<0.1)

#gplots::venn(list(meta=lineage$ID, pathways=plot_results$pathway))
#gplots::venn(list(meta=lineage$ID, pathways=plot_results$pathway), show.plot = F) %>% print() #3 pathways missing from annotation

#attr(,"intersections")$pathways
#[1] "GLYCOLYSIS-TCA-GLYOX-BYPASS" "GLYOXYLATE-BYPASS"           "TCA-GLYOX-BYPASS"       
#These are superpathways being excluded

lineage<-lineage %>% filter(ID %in% plot_results$pathway & Lineage!="Super-Pathways")
lineage<-lineage %>% mutate(Lineage=paste0("Origin|", Lineage,"|", ID))

tree<-
lineage %>%
  filter(!duplicated(ID)) %>% # remove duplicates
  data.tree::FromDataFrameTable(., pathName="Lineage", pathDelimiter = "|")
tree<-igraph::as.igraph(tree)       
lout<-ggraph::create_layout(tree, layout="igraph", circular=TRUE, algorithm="tree")

lout<-lout %>% left_join(plot_results %>% dplyr::rename(name=pathway) %>% mutate(Annotation=paste(name, description, sep=": ")))

lout$Annotation=if_else(is.na(lout$Annotation), lout$name, lout$Annotation)
lout$Annotation<-gsub("−","-", lout$Annotation)

ggraph::ggraph(lout) +
  ggraph::geom_edge_diagonal() +
  ggraph::geom_node_point(aes(fill=estimate, size=-log10(p.value)), shape=21) +
  ggraph::geom_node_text(aes(label=Annotation), size=2) +
  scale_fill_gradient2(low="darkblue",high="darkred") +
  theme_void() +
  theme(legend.position="bottom")

ggsave("manuscript_figures/case_v_control.pdf", height=12, width=24, useDingbats=F)         

Figure S4. Comparison of Esophageal cancer cohorts

Data Processing

Validation cohorts: V3-V4: Chen PRJNA961904 https://bmcmicrobiol.biomedcentral.com/articles/10.1186/s12866-024-03233-4 31 patients and 21 healthy controls, China. (early-stage intramucosal esophageal squamous carcinoma). 52 are saliva. Sample names and numbers reveal metadata. 31 SAL must be saliva cancer, and the 21 SALNC must be no cancer. V3-V4: Jiang PRJNA853196 https://link.springer.com/article/10.1007/s00432-022-04393-4 oral swabs, control = 53, ESCC = 56. metadata available

Given issues with denoising NovaSeq data and the larger amplicon V3-V4 having problematic features for overlap, especially from Novaseq, will carry forward reverse read only.

mkdir external
mkdir external/PRJNA961904
mkdir external/PRJNA853196

export PATH=$PATH:/data/shared_resources/sftwr/sratoolkit.3.0.0-ubuntu64/bin
#vdb-config -i # caching to ~/.prefetch_cache/

while read line; do
  SRR=$(echo $line | cut -d "," -f 1)
  PRJ=$(echo $line | cut -d "," -f 5)
  SAMN=$(echo $line | cut -d "," -f 6)
  echo $(date): $SRR from $SAMN from $PRJ
  
  mkdir -p external/$PRJ/$SAMN
  fasterq-dump --outdir external/$PRJ/$SAMN --threads 6 --mem 32G $SRR
  if [ -f "external/$PRJ/$SAMN/${SRR}_1.fastq" ]; then echo "...Success"; else echo "...FAILURE"; fi
done < external/PRJNA961904_SraRunTable_filt.csv

while read line; do
  SRR=$(echo $line | cut -d "," -f 1)
  PRJ=$(echo $line | cut -d "," -f 5)
  SAMN=$(echo $line | cut -d "," -f 6)
  echo $(date): $SRR from $SAMN from $PRJ
  
  mkdir -p external/$PRJ/$SAMN
  fasterq-dump --outdir external/$PRJ/$SAMN --threads 6 --mem 32G $SRR
  if [ -f "external/$PRJ/$SAMN/${SRR}_1.fastq" ]; then echo "...Success"; else echo "...FAILURE"; fi
done < external/PRJNA853196_SraRunTable_filt.csv

Process PRJNA853196

PCR amplification
16S rDNA genes of V3–V4 region were amplified using universal primers, 
namely 338F (5′-ACTCCTACGGGAGGCAGCAG-3′) and 806R (5′ -GGACTACHVGGGTWTCTAAT-3′). 
All PCR reactions (including denaturation, annealing, and elongation) were carried out with 
Phusion® High-Fidelity PCR Master Mix (New England Biolabs). 
After electrophoresis of PCR products, samples with bright main strip between 
400 and 450 bp were chosen for the next mixing and purification with 
Qiagen Gel Extraction Kit (Qiagen, Germany).

Sequencing processing and analysis
Purified amplicons were pooled in equimolar and paired-end sequenced on an Illumina Novaseq6000 PE250 
platform (Illumina, San Diego, USA). The raw data were filtered with QIIME (v 1.8.0), discarding the 
reads which were de-replicated or shorter than 150 bp. Filtered reads were clustered into OTUs with the
assumed 97% similarity. Compared with the SILVA database (version 128), the species classification 
information of each OTU was obtained. All raw reads were stored in NCBI Sequence Read Archive (SRA) database, 
and the accession number is PRJNA853196.
library(tidyverse)
library(dada2)
library(Biostrings)
dir.create("external/PRJNA853196/filtered")
prj_meta<-
  read_csv("external/PRJNA853196_SraRunTable_filt.csv") %>% dplyr::select(SampleID=BioSample, BioProject, Run, Group) %>%
  mutate(R1=paste0("external/PRJNA853196/", SampleID,"/", Run,"_1.fastq")) %>%
  mutate(R2=paste0("external/PRJNA853196/", SampleID,"/", Run,"_2.fastq")) %>%
  mutate(R1_filt=gsub("SAMN[0-9]+", "filtered", R1)) %>%
  mutate(R2_filt=gsub("SAMN[0-9]+", "filtered", R2))

for(i in prj_meta$R1){message(file.exists(i))}
for(i in prj_meta$R2){message(file.exists(i))}

plotQualityProfile(prj_meta$R1[1:3])
plotQualityProfile(prj_meta$R2[1:3])

Biostrings::readDNAStringSet(prj_meta$R1[1], format="fastq") # 220 x 220, primers appear to have been removed
Biostrings::readDNAStringSet(prj_meta$R2[1], format="fastq")


dmp<-
  filterAndTrim(
    fwd=prj_meta$R1,
    filt=prj_meta$R1_filt,
    rev=prj_meta$R2,
    filt.rev=prj_meta$R2_filt,
    compress=FALSE,
    trimLeft = c(0,0), # primer lengths as the primer seqs are still intact in read
    truncLen = c(200,200),
    multithread = 16,
    rm.phix=TRUE
  )

errF <- learnErrors(prj_meta$R1_filt, multithread=16)
errR <- learnErrors(prj_meta$R2_filt, multithread=16)
dadaFs <- dada(prj_meta$R1_filt, err=errF, multithread=16)
dadaRs <- dada(prj_meta$R2_filt, err=errR, multithread=16)
mergers <- mergePairs(dadaFs, prj_meta$R1_filt, dadaRs, prj_meta$R2_filt, verbose=TRUE)
#seqtab <- makeSequenceTable(mergers)
#dim(seqtab)
#sum(seqtab)
seqtab <- makeSequenceTable(dadaRs)
dim(seqtab)
sum(seqtab)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=16, verbose=TRUE)
sum(seqtab.nochim)/sum(seqtab)

asv_table<-t(seqtab.nochim) %>% qiime2R::filter_features(2,2)
asv_taxonomy<-tibble(ASV=paste0("ASV_", 1:nrow(asv_table)), Sequence=rownames(asv_table))
asv_taxonomy$Sequence<-asv_taxonomy$Sequence %>% DNAStringSet() %>% reverseComplement() %>% as.character()

taxonomy <- assignTaxonomy(asv_taxonomy$Sequence, "/data/shared_resources/databases/Q2_2023.5_db/silva_nr99_v138.1_wSpecies_train_set.fa.gz", multithread=24)
taxonomy <- addSpecies(taxonomy, "/data/shared_resources/databases/Q2_2023.5_db/silva_species_assignment_v138.1.fa.gz", allowMultiple=TRUE)
  
asv_taxonomy<-asv_taxonomy %>% left_join(taxonomy %>% as.data.frame() %>% dplyr::rename(Species_Ambiguous=8) %>% rownames_to_column("Sequence"))

rownames(asv_table)<-asv_taxonomy$ASV
colnames(asv_table)<-gsub("_[12]\\.fastq","", colnames(asv_table))



write_tsv(as.data.frame(asv_table) %>% rownames_to_column("ASV"), "external/PRJNA853196/PRJNA853196_asvtable.tsv")
write_tsv(asv_taxonomy, "external/PRJNA853196/PRJNA853196_asvtaxonomy.tsv")

Process PRJNA961904

Microbial genomic DNA was extracted using the CTAB Method and stored at -20 °C. 
The V3-V4 hypervariable regions, approximately 468 bp in length, were amplified by polymerase chain reaction 
(Primers: 338 F (5′-ACTCCTACGGGAGGCAGCA-3′) and 806R (5′-GGACTACHVGGGTWTCTAAT-3′). 
PCR samples were quantified using the Quant-it PicoGreen dsDNA Assay Kit on a Microplate reader (BioTek, FLx800) and 
mixed based on the amount of data for each sample. DNA library construction was performed using the TruSeq Nano DNA LT 
Library Prep Kit (Illumina Scientific Co., Ltd). The quantity and quality of each library were measured using the Quant-iT 
PicoGreen dsDNA Assay Kit and Agilent High Sensitivity DNA Kit, respectively. For qualified libraries, the NovaSeq 6000 SP Reagent Kit (500 cycles)
on the Illumina NovaSeq machine was used for 2 × 250 bp double-ended sequencing.
Raw data from the 16s rRNA sequence was processed by Personalbio Technology Co., Ltd. (Shanghai, China). 
Amplicon sequence variants (ASVs) were integrated into merged taxonomic abundance and taxonomy classification tables. 
All amplicon raw data have been submitted to the Sequence Read Archive (SRA) in NCBI (Archive number: PRJNA961904).
library(tidyverse)
library(dada2)
library(Biostrings)
dir.create("external/PRJNA961904/filtered")
prj_meta<-
  read_csv("external/PRJNA961904_SraRunTable_filt.csv") %>% dplyr::select(SampleID=BioSample, BioProject, Run, Group) %>%
  mutate(R1=paste0("external/PRJNA961904/", SampleID,"/", Run,"_1.fastq")) %>%
  mutate(R2=paste0("external/PRJNA961904/", SampleID,"/", Run,"_2.fastq")) %>%
  mutate(R1_filt=gsub("SAMN[0-9]+", "filtered", R1)) %>%
  mutate(R2_filt=gsub("SAMN[0-9]+", "filtered", R2))

for(i in prj_meta$R1){message(file.exists(i))}
for(i in prj_meta$R2){message(file.exists(i))}

plotQualityProfile(prj_meta$R1[1:3])
plotQualityProfile(prj_meta$R2[1:3])

Biostrings::readDNAStringSet(prj_meta$R1[1], format="fastq") # primers present and 251 x 243
Biostrings::readDNAStringSet(prj_meta$R2[1], format="fastq")


dmp<-
  filterAndTrim(
    fwd=prj_meta$R1,
    filt=prj_meta$R1_filt,
    rev=prj_meta$R2,
    filt.rev=prj_meta$R2_filt,
    compress=FALSE,
    trimLeft = c(19,20), # primer lengths as the primer seqs are still intact in read
    truncLen = c(210,200),
    multithread = 16,
    rm.phix=TRUE
  )

errF <- learnErrors(prj_meta$R1_filt, multithread=16)
errR <- learnErrors(prj_meta$R2_filt, multithread=16)
dadaFs <- dada(prj_meta$R1_filt, err=errF, multithread=16)
dadaRs <- dada(prj_meta$R2_filt, err=errR, multithread=16)
mergers <- mergePairs(dadaFs, prj_meta$R1_filt, dadaRs, prj_meta$R2_filt, verbose=TRUE)
#seqtab <- makeSequenceTable(mergers)
#dim(seqtab)
#sum(seqtab)
seqtab <- makeSequenceTable(dadaRs)
dim(seqtab)
sum(seqtab)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=16, verbose=TRUE)
sum(seqtab.nochim)/sum(seqtab)

asv_table<-t(seqtab.nochim) %>% qiime2R::filter_features(2,2)
asv_taxonomy<-tibble(ASV=paste0("ASV_", 1:nrow(asv_table)), Sequence=rownames(asv_table))
asv_taxonomy$Sequence<-asv_taxonomy$Sequence %>% DNAStringSet() %>% reverseComplement() %>% as.character()

taxonomy <- assignTaxonomy(asv_taxonomy$Sequence, "/data/shared_resources/databases/Q2_2023.5_db/silva_nr99_v138.1_wSpecies_train_set.fa.gz", multithread=24)
taxonomy <- addSpecies(taxonomy, "/data/shared_resources/databases/Q2_2023.5_db/silva_species_assignment_v138.1.fa.gz", allowMultiple=TRUE)
  
asv_taxonomy<-asv_taxonomy %>% left_join(taxonomy %>% as.data.frame() %>% dplyr::rename(Species_Ambiguous=8) %>% rownames_to_column("Sequence"))

rownames(asv_table)<-asv_taxonomy$ASV
colnames(asv_table)<-gsub("_[12]\\.fastq","", colnames(asv_table))



write_tsv(as.data.frame(asv_table) %>% rownames_to_column("ASV"), "external/PRJNA961904/PRJNA961904_asvtable.tsv")
write_tsv(asv_taxonomy, "external/PRJNA961904/PRJNA961904_asvtaxonomy.tsv")

Figure S4B. Beta Diversity

mulisa<-list()
mulisa$meta<-metadata %>% dplyr::select(SampleID, Group) %>% mutate(Dataset="Mulisa") %>% mutate(Group=if_else(Group=="Diseased", "Case","Control"))
mulisa$asv<-asv_table
mulisa$tax<-asv_taxonomy
mulisa$genus<-summarize_taxa(mulisa$asv, mulisa$tax %>% column_to_rownames("ASV"))$Genus
mulisa$family<-summarize_taxa(mulisa$asv, mulisa$tax %>% column_to_rownames("ASV"))$Family
dim(mulisa$genus)
## [1] 486 211
jiang<-list()
jiang$meta<-read_csv("external/PRJNA853196_SraRunTable_filt.csv") %>% dplyr::select(SampleID=Run, Group) %>% mutate(Dataset="Jiang")
jiang$asv<-read_tsv("external/PRJNA853196/PRJNA853196_asvtable.tsv") %>% column_to_rownames("ASV")
jiang$tax<-read_tsv("external/PRJNA853196/PRJNA853196_asvtaxonomy.tsv")
jiang$genus<-summarize_taxa(jiang$asv, jiang$tax %>% column_to_rownames("ASV"))$Genus
jiang$family<-summarize_taxa(jiang$asv, jiang$tax %>% column_to_rownames("ASV"))$Family
dim(jiang$genus)
## [1] 532 109
chen<-list()
chen$meta<-read_csv("external/PRJNA961904_SraRunTable_filt.csv") %>% dplyr::select(SampleID=Run, Group) %>% mutate(Dataset="Chen")
chen$asv<-read_tsv("external/PRJNA961904/PRJNA961904_asvtable.tsv") %>% column_to_rownames("ASV")
chen$tax<-read_tsv("external/PRJNA961904/PRJNA961904_asvtaxonomy.tsv")
chen$genus<-summarize_taxa(chen$asv, chen$tax %>% column_to_rownames("ASV"))$Genus
chen$family<-summarize_taxa(chen$asv, chen$tax %>% column_to_rownames("ASV"))$Family
dim(chen$genus)
## [1] 325  52
merged_meta<-bind_rows(mulisa$meta, chen$meta, jiang$meta)

merged_genus<-
bind_rows(
mulisa$genus %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
chen$genus %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
jiang$genus %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts")
) %>%
  pivot_wider(names_from = SampleID, values_from = Counts, values_fill = 0) %>%
  column_to_rownames("Taxon")

pc<-
make_clr(merged_genus) %>%
  t() %>%
  dist(., method="euclidean") %>%
  pcoa()

pc$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(merged_meta) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, color=Dataset, shape=Group)) + 
  geom_point(alpha=0.5) +
    xlab(paste0("PC1: ",round(pc$values$Relative_eig[1]*100,2),"%")) +
    ylab(paste0("PC2: ",round(pc$values$Relative_eig[2]*100,2),"%")) +
  scale_shape_manual(values=c(16, 1))

ggsave("manuscript_figures/pco_cross_comparisons.pdf", height=2, width=3, useDingbats=F)

merged_dist<-
make_clr(merged_genus) %>%
  t() %>%
  dist(., method="euclidean")

adonis2(merged_dist~Group*Dataset, data=merged_meta[match(labels(merged_dist), merged_meta$SampleID),], parallel=12)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = merged_dist ~ Group * Dataset, data = merged_meta[match(labels(merged_dist), merged_meta$SampleID), ], parallel = 12)
##           Df SumOfSqs     R2      F Pr(>F)    
## Model      5   107018 0.1997 18.266  0.001 ***
## Residual 366   428879 0.8003                  
## Total    371   535897 1.0000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Figure S4C. Alpha Diversity

merged_alpha<-bind_rows(
diversity(chen$asv %>% subsample_table(), "shannon", MARGIN=2) %>% data.frame(Shannon=.) %>% rownames_to_column("SampleID"),
diversity(jiang$asv %>% subsample_table(), "shannon", MARGIN=2) %>% data.frame(Shannon=.) %>% rownames_to_column("SampleID"), 
diversity(mulisa$asv %>% subsample_table(), "shannon", MARGIN=2) %>% data.frame(Shannon=.) %>% rownames_to_column("SampleID") 
) %>%
  left_join(merged_meta)


merged_alpha %>%
  group_by(Dataset) %>%
  do(
   t.test(Shannon~Group, data=.) %>%
     broom::tidy()
  ) %>%
  knitr::kable()
Dataset estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high method alternative
Chen -0.0673069 3.422203 3.489509 -0.632158 0.5302942 47.82611 -0.2814026 0.1467888 Welch Two Sample t-test two.sided
Jiang 0.1023905 3.356753 3.254363 1.810364 0.0730558 106.62091 -0.0097336 0.2145146 Welch Two Sample t-test two.sided
Mulisa -0.1950222 3.921693 4.116715 -3.019831 0.0028451 208.49174 -0.3223367 -0.0677077 Welch Two Sample t-test two.sided
merged_alpha %>%
  mutate(Group=factor(Group, levels=c("Control","Case"))) %>%
  ggplot(aes(x=Dataset, y=Shannon, fill=Group)) +
  geom_boxplot() +
  scale_fill_manual(values=c("cornflowerblue","indianred")) +
  xlab("Cohort") +
  ylab("Shannon's Diversity Index") +
  theme(legend.position="none")

ggsave("manuscript_figures/cohort_alpha.pdf", height=2, width=3)

Figure S4A. Alpha Diversity

merged_family<-
bind_rows(
mulisa$family %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
chen$family %>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts"),
jiang$family%>% rownames_to_column("Taxon") %>% pivot_longer(!Taxon, names_to = "SampleID", values_to = "Counts")
) %>%
  pivot_wider(names_from = SampleID, values_from = Counts, values_fill = 0) %>%
  column_to_rownames("Taxon")


taxa_barplot(merged_family, merged_meta, category="Dataset") +
  theme(axis.text.x = element_blank(), axis.ticks.x=element_blank())

ggsave("manuscript_figures/cohort_barplot.pdf", height=3, width=8, useDingbats=F)

Figure 6. Cross-cohort comparisons

Figure 6A. ROC curves

library(randomForest)
library(ROCR)
library(pROC)

ROCR_ROCs<-list()
pROC_ROCs<-list()
AUCs<-list()
Importance<-list()
splits<-list()

merged_genus_rf<-merged_genus %>% filter_features(20,20) %>% make_proportion()

for(i in 1:100){
  message(i)
  set.seed(as.numeric(paste0("202407",i)))
  mulisa$meta<-mulisa$meta %>% sample_n(nrow(.)) %>% mutate(Dataset=c(rep("Training", 139), rep("Test", 72)))
  splits[[i]]<-mulisa$meta %>% group_by(Group, Dataset) %>% count() %>% pivot_wider(names_from = Group, values_from = n) %>% mutate(Iteration=i)
  RFmeta<-bind_rows(mulisa$meta, jiang$meta, chen$meta) %>% mutate(Group=factor(Group)) 
  RFdata<-merged_genus_rf[,RFmeta$SampleID] %>% t()
  
  fit<-randomForest(x=RFdata[subset(RFmeta, Dataset=="Training")$SampleID,],
               y=subset(RFmeta, Dataset=="Training")$Group,
               importance=TRUE
  )
  
Importance[[i]]<-
  fit$importance %>%
    as.data.frame() %>%
    rownames_to_column("Taxon") %>%
    arrange(desc("MeanDecreaseGini")) %>%
    mutate(Iteration=i) %>%
    dplyr::select(Iteration, Taxon, everything()) %>%
    as_tibble()
  

  for(Task in c("Training","Test","Jiang","Chen")){
      tm<-predict(fit, newdata=RFdata[subset(RFmeta, Dataset==Task)$SampleID,], type="prob")[,2] %>%
          prediction(., subset(RFmeta, Dataset==Task)$Group) %>%
          performance(., "tpr","fpr")
      ROCR_ROCs[[paste0(Task,"_", i)]]<-
          tibble(Iteration=i, Task=Task, FPR=unlist(tm@x.values), TPR=unlist(tm@y.values))
      #https://stackoverflow.com/questions/54642823/how-to-get-specific-tpr-from-a-sequence-of-fpr-in-r
      pROC_ROCs[[paste0(Task,"_", i)]]<-
        predict(fit, newdata=RFdata[subset(RFmeta, Dataset==Task)$SampleID,], type="prob")[,2] %>%
        roc(subset(RFmeta, Dataset==Task)$Group, .) %>%
        coords(., x=1-seq(0,1,0.02), input="specificity", ret = c("se", "sp")) %>%
          tibble(Iteration=i, Task=Task, FPR=specificity, TPR=sensitivity) %>%
          mutate(FPR=1-FPR)

      tm<-predict(fit, newdata=RFdata[subset(RFmeta, Dataset==Task)$SampleID,], type="prob")[,2] %>%
          prediction(., subset(RFmeta, Dataset==Task)$Group) %>%
          performance(., "auc")
      AUCs[[paste0(Task,"_", i)]]<-tibble(Iteration=i, Task=Task, AUC=tm@y.values[[1]])
    
  }
}

Check class balance of sampled splits.

bind_rows(splits) %>% filter(Dataset=="Training") %>% mutate(Ratio=Case/Control) %>% summarise_at("Ratio", lst(mean,median,min,max,sd))
## # A tibble: 1 × 6
##   Dataset   mean median   min   max     sd
##   <chr>    <dbl>  <dbl> <dbl> <dbl>  <dbl>
## 1 Training 0.961  0.958 0.805  1.17 0.0823
bind_rows(ROCR_ROCs) %>%
  filter(Task!="Training") %>%
  ggplot(aes(x=FPR, y=TPR, color=Task, group=paste(Task, Iteration))) +
  geom_abline(linetype="dashed", color="grey50") +
  geom_line()

ggsave("manuscript_figures/ROC_indiv.pdf", height=2, width=3, useDingbats=F)


bind_rows(pROC_ROCs) %>%
  filter(Task!="Training") %>%
  ggplot(aes(x=FPR, y=TPR, fill=Task)) +
  geom_abline(linetype="dashed", color="grey50") +
  stat_summary(geom="ribbon", alpha=0.5, fun.data=mean_sd) +
  stat_summary(geom="line", linetype="dashed")

ggsave("manuscript_figures/ROC_ribbon.pdf", height=2, width=3, useDingbats=F)

AUC values and Importance Scores for inset:

bind_rows(AUCs) %>% group_by(Task) %>% summarize(AUC_mean=mean(AUC), AUC_median=median(AUC), AUC_sd=sd(AUC), AUC_min=min(AUC), AUC_max=max(AUC)) %>% interactive_table()
bind_rows(AUCs) %>% group_by(Task) %>% summarize(AUC_mean=mean(AUC), AUC_median=median(AUC), AUC_sd=sd(AUC), AUC_min=min(AUC), AUC_max=max(AUC)) %>% write_tsv("manuscript_figures/AUROCS.tsv")

bind_rows(Importance) %>%
  group_by(Taxon) %>%
  summarize_at("MeanDecreaseGini", lst(mean,sd,median,min,max)) %>%
  arrange(desc(mean)) %>%
  interactive_table()

Figure 6B. Importance vs Rank

bind_rows(Importance) %>%
  group_by(Taxon) %>%
  summarize_at("MeanDecreaseGini", lst(mean,sd,median,min,max)) %>%
  arrange(desc(mean)) %>%
  mutate(Rank=1:nrow(.)) %>%
  mutate(Taxon=factor(Taxon, levels=rev(Taxon))) %>%
  ggplot(aes(x=Rank, y=mean, ymin=mean-sd, ymax=mean+sd)) +
  geom_ribbon(fill="grey80") +
  geom_line(linetype="dashed", color="grey20") +
  #geom_errorbar(width=0) +
  #geom_point() +
  coord_flip() +
  scale_x_reverse() +
  ylab("Mean Decrease GINI (mean ± SD)")

ggsave("manuscript_figures/Importance_all.pdf", height=2, width=2, useDingbats=F)

Figure 6C. Importance vs Rank - top 20

bind_rows(Importance) %>%
  group_by(Taxon) %>%
  summarize_at("MeanDecreaseGini", lst(mean,sd,median,min,max)) %>%
  arrange(desc(mean)) %>%
  mutate(Rank=1:nrow(.)) %>%
  filter(Rank<20) %>%
  mutate(Taxon=factor(Taxon, levels=rev(Taxon))) %>%
  ggplot(aes(x=Taxon, y=mean, ymin=mean-sd, ymax=mean+sd)) +
  geom_errorbar(width=0) +
  geom_point() +
  coord_flip()  +
  ylab("Mean Decrease GINI (mean ± SD)")

ggsave("manuscript_figures/Importance_top20.pdf", height=2, width=5, useDingbats=F)